reconstruct code for GPU ops

This commit is contained in:
Pengyongrong 2020-11-10 01:01:39 -08:00
parent ae62d15bc0
commit d210921115
17 changed files with 401 additions and 616 deletions

View File

@ -17,22 +17,6 @@ __kernel void ArithmeticSelf_ElementAbs_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise absolute value on an NC4HW4-laid-out image: out[i] = |in[i]|.
// output_shape: presumably (N, H, W, ceil(C/4)) — confirm against host code.
// Global size covers (N*H, W, C/4) and may be rounded up, hence the guard.
__kernel void ArithmeticSelf_ElementAbs_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(Y, Z * output_shape.y + X);
  FLT4 result = READ_IMAGE(input0, smp_none, pos);
  // fabs() operates component-wise on vector types; unlike the former
  // per-component ternary it also maps -0.0 to +0.0 and preserves NaN.
  result = fabs(result);
  WRITE_IMAGE(output, pos, result);
}
__kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -49,22 +33,6 @@ __kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise cosine on an NC4HW4-laid-out image: out[i] = cos(in[i]).
__kernel void ArithmeticSelf_ElementCos_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // cos() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, cos(v));
}
__kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -81,22 +49,6 @@ __kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise sine on an NC4HW4-laid-out image: out[i] = sin(in[i]).
__kernel void ArithmeticSelf_ElementSin_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // sin() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, sin(v));
}
__kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -113,22 +65,6 @@ __kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise negation on an NC4HW4-laid-out image: out[i] = -in[i].
__kernel void ArithmeticSelf_ElementNeg_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // Unary minus negates every component of the vector at once.
  WRITE_IMAGE(output, pos, -v);
}
__kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -145,22 +81,6 @@ __kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise natural exponential on an NC4HW4-laid-out image: out[i] = e^in[i].
__kernel void ArithmeticSelf_ElementExp_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // exp() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, exp(v));
}
__kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -177,22 +97,6 @@ __kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise natural logarithm on an NC4HW4-laid-out image.
// Inputs that are not strictly positive produce HUGE_VALF (+infinity)
// instead of log()'s -inf (for 0) or NaN (for negatives).
// output_shape: presumably (N, H, W, ceil(C/4)) — confirm against host code.
__kernel void ArithmeticSelf_ElementLog_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // Per-component guard: only strictly positive values go through log().
  result.x = result.x > 0 ? log(result.x) : HUGE_VALF;
  result.y = result.y > 0 ? log(result.y) : HUGE_VALF;
  result.z = result.z > 0 ? log(result.z) : HUGE_VALF;
  result.w = result.w > 0 ? log(result.w) : HUGE_VALF;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -209,22 +113,6 @@ __kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, _
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise square on an NC4HW4-laid-out image: out[i] = in[i] * in[i].
__kernel void ArithmeticSelf_ElementSquare_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                  int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // Vector multiply squares every component at once.
  WRITE_IMAGE(output, pos, v * v);
}
__kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -241,22 +129,6 @@ __kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __w
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise square root on an NC4HW4-laid-out image.
// Inputs that are not strictly positive produce HUGE_VALF (+infinity).
// NOTE(review): the guard is `> 0`, so an input of exactly 0 yields
// HUGE_VALF rather than sqrt(0) == 0 — confirm this is intentional.
__kernel void ArithmeticSelf_ElementSqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // Per-component guard: only strictly positive values go through sqrt().
  result.x = result.x > 0 ? sqrt(result.x) : HUGE_VALF;
  result.y = result.y > 0 ? sqrt(result.y) : HUGE_VALF;
  result.z = result.z > 0 ? sqrt(result.z) : HUGE_VALF;
  result.w = result.w > 0 ? sqrt(result.w) : HUGE_VALF;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -273,22 +145,6 @@ __kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise reciprocal square root (1/sqrt(x)) on an NC4HW4-laid-out image.
// Inputs that are not strictly positive produce HUGE_VALF (+infinity); for an
// input of 0 this agrees with the mathematical limit 1/sqrt(0) = +inf.
__kernel void ArithmeticSelf_ElementRsqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                 int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // Per-component guard, then explicit 1.0f / sqrt(x).
  // NOTE(review): OpenCL's rsqrt() builtin could replace the division but may
  // differ slightly in precision — left as-is to preserve exact results.
  result.x = result.x > 0 ? 1.0f / sqrt(result.x) : HUGE_VALF;
  result.y = result.y > 0 ? 1.0f / sqrt(result.y) : HUGE_VALF;
  result.z = result.z > 0 ? 1.0f / sqrt(result.z) : HUGE_VALF;
  result.w = result.w > 0 ? 1.0f / sqrt(result.w) : HUGE_VALF;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -305,22 +161,6 @@ __kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise logical NOT on an NC4HW4-laid-out image, expressed in floats:
// a non-zero component maps to 0.0, a zero component maps to 1.0.
// Note: NaN compares false on both `> 0` and `< 0`, so NaN also maps to 1.0.
__kernel void ArithmeticSelf_ElementLogicalNot_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                      int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // `false`/`true` convert to 0/1 when assigned to a float component.
  result.x = result.x > 0 || result.x < 0 ? false : true;
  result.y = result.y > 0 || result.y < 0 ? false : true;
  result.z = result.z > 0 || result.z < 0 ? false : true;
  result.w = result.w > 0 || result.w < 0 ? false : true;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -337,22 +177,6 @@ __kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise floor on an NC4HW4-laid-out image: out[i] = floor(in[i]).
__kernel void ArithmeticSelf_ElementFloor_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                 int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // floor() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, floor(v));
}
__kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -369,22 +193,6 @@ __kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __w
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise ceiling on an NC4HW4-laid-out image: out[i] = ceil(in[i]).
__kernel void ArithmeticSelf_ElementCeil_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // ceil() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, ceil(v));
}
__kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -401,18 +209,3 @@ __kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise rounding on an NC4HW4-laid-out image: out[i] = round(in[i])
// (rounds to nearest integer, halfway cases away from zero).
__kernel void ArithmeticSelf_ElementRound_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                 int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // round() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, round(v));
}

View File

@ -1,131 +1,48 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#define C4NUM 4
__kernel void SparseToDenseScalarDim0(__read_only image2d_t input, __write_only image2d_t output, float weight,
int2 input_shape, float default_value) {
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, 0));
FLT4 result = {default_value, default_value, default_value, default_value};
int integer = index_input.x / C4NUM;
int decimal = (int)(index_input.x) % C4NUM;
if (decimal == 0) {
result.x = weight;
} else if (decimal == 1) {
result.y = weight;
} else if (decimal == 2) {
result.z = weight;
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void SparseToDenseScalar(__read_only image2d_t input, __global float *output, float weight, int2 inputshape,
int4 outputshape, float default_value, int stride_w, int inshapeindex1_dim) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= inputshape.x || Y >= inputshape.y) {
return;
}
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
int index = 0;
if (inshapeindex1_dim == 1) {
index = ((int)index_input.x) * stride_w;
} else if (inshapeindex1_dim == 2) {
index = ((int)index_input.x) * stride_w + ((int)index_input.y);
} else if (inshapeindex1_dim == 3) {
index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + ((int)index_input.z);
} else {
result.w = weight;
index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
}
WRITE_IMAGE(output, (int2)(0, integer), result);
return;
output[index] = weight;
}
__kernel void SparseToDenseScalarDim1(__read_only image2d_t input, __write_only image2d_t output, float weight,
int2 input_shape, float default_value) {
for (int i = 0; i < input_shape.x; ++i) {
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
int Y = result.x;
result.x = weight;
WRITE_IMAGE(output, (int2)(0, Y), result);
__kernel void SparseToDenseVector(__read_only image2d_t input, __global float *output, __global float *weight_vector,
int2 inputshape, int4 outputshape, float default_value, int stride_w,
int inshapeindex1_dim) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= inputshape.x || Y >= inputshape.y) {
return;
}
}
__kernel void SparseToDenseVectorDim1(__read_only image2d_t input, __write_only image2d_t output,
__global float *weight, int2 input_shape, float default_value) {
int index_weight = 0;
for (int i = 0; i < input_shape.x; ++i) {
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
int Y = result.x;
result.x = weight[index_weight++];
WRITE_IMAGE(output, (int2)(0, Y), result);
}
}
__kernel void SparseToDenseScalarDim2Shape2(__read_only image2d_t input, __write_only image2d_t output, float weight,
int2 input_shape, float default_value) {
FLT temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
FLT result_temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
int index = 0; // 0~4
int X = 0;
FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0));
int Y = (int)index_begin.x; // N
temp[index] = index_begin.y; // c/4
for (int i = 1; i < input_shape.x && index < C4NUM; ++i) {
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i));
if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) {
index++;
if (index < C4NUM) {
temp[index] = index_input.y;
}
} else {
for (int j = 0; j <= index && index < C4NUM; ++j) {
int decimal = (int)temp[j] % C4NUM;
result_temp[decimal] = weight;
X = ((int)temp[0]) / C4NUM;
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
index = 0;
Y = (int)index_input.x;
temp[0] = index_input.y;
temp[1] = temp[2] = temp[3] = default_value;
result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value;
}
}
// judge the last element for input
X = ((int)temp[0]) / C4NUM;
for (int i = 0; i <= index && index < C4NUM; ++i) {
int decimal = (int)temp[i] % C4NUM;
result_temp[decimal] = weight;
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
}
__kernel void SparseToDenseVectorDim2Shape2(__read_only image2d_t input, __write_only image2d_t output,
__global float *weight, int2 input_shape, float default_value) {
FLT temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
FLT result_temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
int index = 0; // 0~4
int weight_index = 0;
int X = 0;
FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0));
int Y = (int)index_begin.x; // N
temp[index] = index_begin.y; // c/4
for (int i = 1; i < input_shape.x && index < C4NUM; ++i) {
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i));
if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) {
index++;
if (index < C4NUM) {
temp[index] = index_input.y;
}
} else {
for (int j = 0; j <= index && index < C4NUM; ++j) {
int decimal = (int)temp[j] % C4NUM;
result_temp[decimal] = weight[weight_index++];
X = ((int)temp[0]) / C4NUM;
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
index = 0;
Y = (int)index_input.x;
temp[0] = index_input.y;
temp[1] = temp[2] = temp[3] = default_value;
result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value;
}
}
// judge the last element for input
X = ((int)temp[0]) / C4NUM;
for (int i = 0; i <= index && index < C4NUM; ++i) {
int decimal = (int)temp[i] % C4NUM;
result_temp[decimal] = weight[weight_index++];
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
int index = 0;
if (inshapeindex1_dim == 1) {
index = ((int)index_input.x) * stride_w;
} else if (inshapeindex1_dim == 2) {
index = ((int)index_input.x) * stride_w + (int)index_input.y;
} else if (inshapeindex1_dim == 3) {
index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + (int)index_input.z;
} else {
index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
}
output[index] = weight_vector[X];
}

View File

@ -87,24 +87,17 @@ void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, Arithme
}
}
int ArithmeticSelfOpenCLKernel::Init() {
int ArithmeticSelfOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) {
MS_LOG(ERROR) << " only support dim = 4 or 2 but your dim = " << in_tensors_[0]->shape().size();
return RET_ERROR;
}
auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_);
return RET_OK;
}
std::string kernel_name = "ArithmeticSelf";
GetKernelName(&kernel_name, param);
kernel_name += "_NHWC4";
MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
std::set<std::string> build_options;
std::string source = arithmeticself_source;
std::string program_name = "ArithmeticSelf";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
return mindspore::lite::RET_OK;
void ArithmeticSelfOpenCLKernel::SetConstArgs() {
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
}
void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
@ -121,11 +114,8 @@ void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<s
local->push_back(z);
}
int ArithmeticSelfOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
void ArithmeticSelfOpenCLKernel::SetGlobalLocal() {
auto output_shape = out_tensors_[0]->shape();
cl_int4 output_shape_ = {};
uint32_t OH = 1, OW = 1, OC = 1;
if (output_shape.size() == 4) {
output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
@ -142,49 +132,48 @@ int ArithmeticSelfOpenCLKernel::Run() {
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
ArithmeticSelfGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
}
int ArithmeticSelfOpenCLKernel::Prepare() {
auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_);
std::string kernel_name = "ArithmeticSelf";
GetKernelName(&kernel_name, param);
kernel_name += "_NHWC4";
MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
std::set<std::string> build_options;
std::string source = arithmeticself_source;
std::string program_name = "ArithmeticSelf";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
SetGlobalLocal();
SetConstArgs();
return RET_OK;
}
int ArithmeticSelfOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return mindspore::lite::RET_OK;
return RET_OK;
}
kernel::LiteKernel *OpenCLArithmeticSelfKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs,
OpParameter *opParameter, const lite::InnerContext *ctx,
const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) ArithmeticSelfOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << " new ArithmeticSelfOpenCLKernel failed ";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << " Init kernel failed, name: ArithmeticSelf ";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
} // namespace mindspore::kernel

View File

@ -32,13 +32,17 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel {
~ArithmeticSelfOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
private:
void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param);
cl_int4 output_shape_ = {};
cl::Kernel kernel_;
};

View File

@ -30,16 +30,7 @@ using mindspore::schema::PrimitiveType_BatchNorm;
namespace mindspore::kernel {
int BatchNormOpenCLKernel::Init() {
std::string kernel_name = "Batch_normalization_NHWC4";
std::set<std::string> build_options;
std::string source = batchnorm_source;
std::string program_name = "Batch_normalization";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
int BatchNormOpenCLKernel::CheckSpecs() { return RET_OK; }
void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
@ -55,13 +46,17 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
local->push_back(z);
}
int BatchNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
void BatchNormOpenCLKernel::SetConstArgs() {
int arg_cn = 6;
auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
auto input0_shape = in_tensors_[0]->shape();
auto output_shape = out_tensors_[0]->shape();
cl_int4 input_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], UP_DIV(input0_shape[3], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
}
void BatchNormOpenCLKernel::SetGlobalLocal() {
auto output_shape = out_tensors_[0]->shape();
uint32_t OH = output_shape[1];
uint32_t OW = output_shape[2];
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
@ -70,6 +65,25 @@ int BatchNormOpenCLKernel::Run() {
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
BatchNormGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
}
int BatchNormOpenCLKernel::Prepare() {
std::string kernel_name = "Batch_normalization_NHWC4";
std::set<std::string> build_options;
std::string source = batchnorm_source;
std::string program_name = "Batch_normalization";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetConstArgs();
SetGlobalLocal();
return RET_OK;
}
int BatchNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); // scale
@ -77,32 +91,11 @@ int BatchNormOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) BatchNormOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << " new BatchnormOpenCLKernel failed ";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << " Init kernel failed, name: Batchnorm ";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>)
} // namespace mindspore::kernel

View File

@ -31,9 +31,12 @@ class BatchNormOpenCLKernel : public OpenCLKernel {
~BatchNormOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
cl::Kernel kernel_;

View File

@ -1,3 +1,4 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
@ -42,18 +43,13 @@ int CastOpenCLKernel::GetKernelName(std::string *kernel_name, CastParameter *par
return RET_OK;
}
int CastOpenCLKernel::Init() {
auto param = reinterpret_cast<CastParameter *>(this->op_parameter_);
std::string kernel_name = "Cast";
GetKernelName(&kernel_name, param);
kernel_name += "_NHWC4";
std::set<std::string> build_options;
std::string source = cast_source;
std::string program_name = "cast";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
int CastOpenCLKernel::CheckSpecs() { return RET_OK; }
void CastOpenCLKernel::SetConstArgs() {
auto input_shape = in_tensors_[0]->shape();
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
}
void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
@ -70,11 +66,8 @@ void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *lo
local->push_back(z);
}
int CastOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
void CastOpenCLKernel::SetGlobalLocal() {
auto input_shape = in_tensors_[0]->shape();
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
uint32_t OH = input_shape[1];
uint32_t OW = input_shape[2];
uint32_t OC = UP_DIV(input_shape[3], C4NUM);
@ -83,34 +76,35 @@ int CastOpenCLKernel::Run() {
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
CastGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
}
// Build the "Cast..._NHWC4" OpenCL kernel for this op's source/destination
// types, then cache the constant kernel args and the ND-range so Run() only
// has to bind the tensors and enqueue.
int CastOpenCLKernel::Prepare() {
  auto *param = reinterpret_cast<CastParameter *>(this->op_parameter_);
  std::string kernel_name = "Cast";
  GetKernelName(&kernel_name, param);  // appends the dtype-pair suffix
  kernel_name += "_NHWC4";
  std::string program_name = "cast";
  std::string source = cast_source;
  std::set<std::string> build_options;
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetConstArgs();
  SetGlobalLocal();
  return RET_OK;
}
int CastOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
// Factory for CastOpenCLKernel: allocate, Init(), and on any failure release
// the kernel (or opParameter if allocation itself failed) and return nullptr.
kernel::LiteKernel *OpenCLCastKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                            const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                            const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) CastOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new CastOpenCLKernel failed ";
    free(opParameter);  // opParameter ownership was never transferred
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Cast ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLCastKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLCastKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -31,8 +31,11 @@ class CastOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~CastOpenCLKernel() override = default;
int Prepare() override;
int Init() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -49,7 +49,21 @@ int ConcatOpenCLKernel::RunAxis0() {
return RET_OK;
}
int ConcatOpenCLKernel::Init() {
// Split a budget of max_size work-items across (x, y, z) local dimensions:
// x is capped at 2, y at 8 (both chosen as divisors of the global size),
// and z takes the remaining budget, at most ceil(global[2] / 2).
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  constexpr int max_divider = 8;
  constexpr int max_x = 2;
  constexpr int max_y = 8;
  const int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
  const int yz_budget = max_size / x;
  const int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz_budget), max_y);
  const int z = std::min(yz_budget / y, static_cast<int>(UP_DIV(global[2], 2)));
  local->clear();
  local->push_back(static_cast<size_t>(x));
  local->push_back(static_cast<size_t>(y));
  local->push_back(static_cast<size_t>(z));
}
int ConcatOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->shape().size() != 4) {
MS_LOG(ERROR) << " only support dim = 4 ";
return RET_ERROR;
@ -65,6 +79,32 @@ int ConcatOpenCLKernel::Init() {
return RET_ERROR;
}
return RET_OK;
}
void ConcatOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
auto output_shape = out_tensors_[0]->shape();
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
int arg_cn = 2 * in_tensors_.size() + 1;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
}
// Configure the 3D ND-range over the output: dim 0 folds N and H together,
// dim 1 is W, dim 2 is the channel dimension.
void ConcatOpenCLKernel::SetGlobalLocal() {
  auto output_shape = out_tensors_[0]->shape();
  // Device work-item limits, used to bound the local work-group size.
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  std::vector<size_t> local = {1, 1, 1};
  uint32_t OH = output_shape[0] * output_shape[1];  // N * H
  uint32_t OW = output_shape[2];                    // W
  // NOTE(review): the pre-refactor Run() sized this dimension as
  // UP_DIV(output_shape[3], C4NUM) (slices of 4 channels); here the raw
  // channel count is used — confirm which size the OpenCL kernel expects.
  uint32_t OC = output_shape[3];
  std::vector<size_t> global = {OH, OW, OC};
  ConcatGetWorkGroup(global, &local, max_global[0]);
  OpenCLKernel::AlignGlobalLocal(global, local);
}
int ConcatOpenCLKernel::Prepare() {
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
std::string kernel_name = "Concat";
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_);
@ -80,38 +120,17 @@ int ConcatOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetGlobalLocal();
SetConstArgs();
return RET_OK;
}
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 2, max_y = 8;
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
local->push_back(x);
local->push_back(y);
local->push_back(z);
}
int ConcatOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
if (param->axis_ == 0) {
return RunAxis0();
}
auto output_shape = out_tensors_[0]->shape();
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1};
uint32_t OH = output_shape_.s[0] * output_shape_.s[1];
uint32_t OW = output_shape_.s[2];
uint32_t OC = output_shape_.s[3];
std::vector<size_t> global = {OH, OW, OC};
ConcatGetWorkGroup(global, &local, max_global[0]);
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
int arg_cn = 0;
for (int i = 0; i < in_tensors_.size(); ++i) {
@ -123,35 +142,14 @@ int ConcatOpenCLKernel::Run() {
UP_DIV(in_tensors_[i]->shape()[3], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
} else {
MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
return RET_ERROR;
}
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
// Factory for ConcatOpenCLKernel: allocate, Init(), and on any failure
// release the kernel (or opParameter if allocation failed) and return nullptr.
kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                              const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) ConcatOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new ConcatOpenCLKernel failed ";
    free(opParameter);  // opParameter ownership was never transferred
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Concat ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLConcatKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLConcatKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -31,8 +31,11 @@ class ConcatOpenCLKernel : public OpenCLKernel {
~ConcatOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
private:

View File

@ -62,7 +62,11 @@ int FillOpenCLKernel::RunShape() {
return RET_OK;
}
int FillOpenCLKernel::Init() {
// Intentionally empty: Fill/Shape bind no constant kernel arguments.
void FillOpenCLKernel::SetConstArgs() {}
// Intentionally empty: no global/local ND-range is configured for Fill/Shape.
void FillOpenCLKernel::SetGlobalLocal() {}
int FillOpenCLKernel::CheckSpecs() {
auto param = this->op_parameter_;
if (out_tensors_[0]->shape().size() > 4) {
@ -76,6 +80,8 @@ int FillOpenCLKernel::Init() {
return RET_OK;
}
// Nothing to pre-build or pre-bind for Fill/Shape; always succeeds.
int FillOpenCLKernel::Prepare() { return RET_OK; }
int FillOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = this->op_parameter_;
@ -88,28 +94,9 @@ int FillOpenCLKernel::Run() {
return RET_OK;
}
// Factory for FillOpenCLKernel (serves both Fill and Shape primitives):
// allocate, Init(), and on any failure release the kernel (or opParameter
// if allocation itself failed) and return nullptr.
kernel::LiteKernel *FillOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                            const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                            const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) FillOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new FillOpenCLKernel failed ";
    free(opParameter);  // opParameter ownership was never transferred
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: fill ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -32,7 +32,11 @@ class FillOpenCLKernel : public OpenCLKernel {
~FillOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -90,24 +90,64 @@ int SparseToDenseOpenCLKernel::InitWeights() {
return RET_OK;
}
int SparseToDenseOpenCLKernel::Init() {
int SparseToDenseOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().size() > 4) {
MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size() << "outdim"
<< out_tensors_[0]->shape().size();
return RET_ERROR;
}
if (out_tensors_[0]->shape().size() > 2 || in_tensors_.size() < 3) {
MS_LOG(ERROR) << " only support dim <= 2 and in_tensors_.size >= 3";
return RET_ERROR;
}
if ((in_tensors_[0]->shape()[1] > 3) && (input_dim_ == 2)) {
MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: "
<< in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_;
if (input_dim_ == 2) {
if ((in_tensors_[0]->shape()[1] > 4)) {
MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: "
<< in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_;
return ERROR;
}
}
if (inshapeindex1_dim > 4) {
MS_LOG(ERROR) << "Unsupported input_indices[1] > 4: ";
return ERROR;
}
input_dim_ = in_tensors_[0]->shape().size();
weight_scalar_ = in_tensors_[2]->IsScalar();
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "ScalarDim" : "VectorDim") +
std::to_string(in_tensors_[0]->shape()[1] == 1 ? 1 : input_dim_);
if (input_dim_ == 2 && in_tensors_[0]->shape()[1] != 1) {
kernel_name += "Shape" + std::to_string(in_tensors_[0]->shape()[1]);
auto param = reinterpret_cast<SparseToDenseParameter *>(op_parameter_);
if (param->validate_indices_) {
MS_LOG(ERROR) << "Unspported unordered for in_tensors_indices";
return RET_ERROR;
}
return RET_OK;
}
// Bind the constant kernel args for SparseToDense, starting at index 3
// (args 0..2 — indices, output buffer, weights — are bound in Run()):
// flattened 2D input shape, 4D output shape, default value, output row
// stride in elements, and the width of each index row.
// Fix: removed the unused local `out_shape_temp` (dead code).
void SparseToDenseOpenCLKernel::SetConstArgs() {
  // NOTE(review): runtime_wrapper is not referenced below (RowPitch()
  // acquires its own wrapper internally); kept in case the RAII instance is
  // load-bearing — confirm and remove.
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  Image2DInfo img_info(out_tensors_[0]);
  size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
  // Row pitch in elements: bytes per aligned image row / bytes per element.
  stride_w = img_info.RowPitch() / dtype;
  cl_int2 input_shape = {n_ * h_, w_ * UP_DIV(c_, C4NUM)};
  cl_int4 out_shape = {out_n_, out_h_, out_w_, UP_DIV(out_c_, C4NUM)};
  int arg_cn = 3;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim);
}
void SparseToDenseOpenCLKernel::SetGlobalLocal() {
std::vector<size_t> local = {1, 1};
size_t OH = n_ * h_;
size_t OW = w_ * UP_DIV(c_, C4NUM);
std::vector<size_t> global = {OH, OW};
OpenCLKernel::AlignGlobalLocal(global, local);
}
int SparseToDenseOpenCLKernel::Prepare() {
input_dim_ = in_tensors_[0]->shape().size();
inshapeindex1_dim = in_tensors_[0]->shape()[1];
weight_scalar_ = in_tensors_[2]->IsScalar();
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
std::set<std::string> build_options;
std::string source = sparse_to_dense_source;
std::string program_name = "SparseToDense";
@ -122,8 +162,10 @@ int SparseToDenseOpenCLKernel::Init() {
default_ = *reinterpret_cast<float *>(input_tensor3->data_c());
}
}
InitWeights();
InferShapeTo4D();
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -131,73 +173,47 @@ int SparseToDenseOpenCLKernel::Init() {
int SparseToDenseOpenCLKernel::InferShapeTo4D() {
if (in_tensors_[0]->shape().size() <= 4) {
if (in_tensors_[0]->shape().size() == 1) {
N_ = in_tensors_[0]->shape()[0];
n_ = in_tensors_[0]->shape()[0];
} else if (in_tensors_[0]->shape().size() == 2) {
N_ = in_tensors_[0]->shape()[0];
C_ = in_tensors_[0]->shape()[1];
} else if (in_tensors_[0]->shape().size() == 3) {
N_ = in_tensors_[0]->shape()[0];
W_ = in_tensors_[0]->shape()[1];
C_ = in_tensors_[0]->shape()[2];
} else {
N_ = in_tensors_[0]->shape()[0];
H_ = in_tensors_[0]->shape()[1];
W_ = in_tensors_[0]->shape()[2];
C_ = in_tensors_[0]->shape()[3];
n_ = in_tensors_[0]->shape()[0];
c_ = in_tensors_[0]->shape()[1];
}
}
if (out_tensors_[0]->shape().size() <= 4) {
if (out_tensors_[0]->shape().size() == 1) {
out_n_ = out_tensors_[0]->shape()[0];
} else if (out_tensors_[0]->shape().size() == 2) {
out_n_ = out_tensors_[0]->shape()[0];
out_c_ = out_tensors_[0]->shape()[1];
} else if (out_tensors_[0]->shape().size() == 3) {
out_n_ = out_tensors_[0]->shape()[0];
out_w_ = out_tensors_[0]->shape()[1];
out_c_ = out_tensors_[0]->shape()[2];
} else {
out_n_ = out_tensors_[0]->shape()[0];
out_h_ = out_tensors_[0]->shape()[1];
out_w_ = out_tensors_[0]->shape()[2];
out_c_ = out_tensors_[0]->shape()[3];
}
} else {
MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size();
return RET_ERROR;
}
return RET_OK;
}
int SparseToDenseOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
InferShapeTo4D();
cl_int2 input_shape = {static_cast<cl_int>(N_ * H_), static_cast<cl_int>(W_ * UP_DIV(C_, C4NUM))};
InitOutputToDefault();
std::vector<size_t> local = {1, 1};
std::vector<size_t> global = {1, 1};
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
if (weight_scalar_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
if (!weight_scalar_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF);
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
// Factory for SparseToDenseOpenCLKernel: validate inputs, allocate, Init();
// free/delete on every failure path so opParameter never leaks.
// Fix: error messages referred to "Hswish"/"hswish" — a copy-paste from the
// Hswish creator — and now name the actual kernel.
kernel::LiteKernel *SparseToDenseOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                     const std::vector<lite::Tensor *> &outputs,
                                                     OpParameter *opParameter, const lite::InnerContext *ctx,
                                                     const kernel::KernelKey &desc,
                                                     const mindspore::lite::PrimitiveC *primitive) {
  if (inputs.empty()) {
    MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
    free(opParameter);
    return nullptr;
  }
  auto *kernel = new (std::nothrow) SparseToDenseOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new SparseToDenseOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: SparseToDense ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -31,9 +31,12 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
~SparseToDenseOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int Run() override;
int InitWeights() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int CheckSpecs() override;
private:
int InferShapeTo4D();
@ -47,12 +50,19 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
float weight_scalar_{0.f};
void *weight_vector_{nullptr};
int input_dim_{1};
int inshapeindex1_dim{1};
cl_int stride_w{1};
std::vector<int32_t> output_shape_;
size_t N_{1};
size_t H_{1};
size_t W_{1};
size_t C_{1};
cl_int n_{1};
cl_int h_{1};
cl_int w_{1};
cl_int c_{1};
cl_int out_n_{1};
cl_int out_h_{1};
cl_int out_w_{1};
cl_int out_c_{1};
};
} // namespace mindspore::kernel
#endif

View File

@ -112,7 +112,7 @@ struct Image2DInfo {
size_t RowPitch() const {
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size;
size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size;
return row_pitch;
}

View File

@ -98,8 +98,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) {
}
arithmeticself_kernel->Init();
// to do allocate memory for inputs and outputs
for (auto &input_tensor : inputs) {
input_tensor->MallocData(allocator);
for (auto &input_tensor_ : inputs) {
input_tensor_->MallocData(allocator);
}
MS_LOG(INFO) << " initialize sub_graph ";
std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
@ -186,8 +186,8 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) {
}
arithmeticself_kernel->Init();
// to do allocate memory for inputs and outputs
for (auto &input_tensor : inputs) {
input_tensor->MallocData(allocator);
for (auto &input_tensor_ : inputs) {
input_tensor_->MallocData(allocator);
}
MS_LOG(INFO) << " initialize sub_graph ";
std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
@ -206,7 +206,6 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) {
}
sub_graph->Init();
MS_LOG(INFO) << " initialize input data ";
std::cout << sizeof(input_data1) / sizeof(float) << std::endl;
memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
std::cout << "==================output data================" << std::endl;
@ -280,8 +279,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfdim2Fp16) {
}
arithmeticself_kernel->Init();
// to do allocate memory for inputs and outputs
for (auto &input_tensor : inputs) {
input_tensor->MallocData(allocator);
for (auto &input_tensor_ : inputs) {
input_tensor_->MallocData(allocator);
}
MS_LOG(INFO) << " initialize sub_graph ";
std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};

View File

@ -28,6 +28,78 @@ class TestSparseToDenseOpenCLCI : public mindspore::CommonTest {
TestSparseToDenseOpenCLCI() {}
};
// SparseToDense, fp32, 2D indices of width 3, vector values:
// scatter values 1..6 at indices (0,0,0)..(0,0,5) into a {6,1,10} output
// initialized to the default 0.0, and compare against the CPU reference.
TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Shape3Vector) {
  MS_LOG(INFO) << " begin test ";
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  auto runtime = runtime_wrapper.GetInstance();
  runtime->Init();
  auto allocator = runtime->GetAllocator();

  MS_LOG(INFO) << " init tensors ";
  std::vector<int> input_shape1 = {6, 3};  // 6 index rows, 3 coordinates each
  std::vector<int> input_shape2 = {3};     // output-shape tensor
  std::vector<int> input_shape3 = {6};     // one value per index row
  std::vector<int> input_shape4 = {1};     // scalar default value
  // Fix: tensor {6, 3} holds 18 floats, but the original array had 21 values
  // (7 rows), so the memcpy below overflowed the tensor buffer by one row.
  float input_data1[] = {0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5};
  float input_data2[] = {6, 1, 10};
  float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  float input_data4[] = {0.0};
  float correctOutput[] = {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  auto data_type = kNumberTypeFloat32;
  std::vector<int> output_shape = {6, 1, 10};
  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_TENSOR);
  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
  // allocate memory for weights
  in_tensor2.MallocData();
  in_tensor3.MallocData();
  in_tensor4.MallocData();
  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
  std::vector<lite::Tensor *> outputs{&output_tensor};
  // initialize weights
  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));

  MS_LOG(INFO) << " initialize tensors ";
  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
  if (param == nullptr) {
    MS_LOG(INFO) << " new ActivationParameter failed ";
    return;
  }
  // Fix: zero-init the malloc'd parameter so fields such as
  // validate_indices_ are not read uninitialized by the kernel.
  memset(param, 0, sizeof(SparseToDenseParameter));
  auto *sparse_to_dense_kernel =
    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  if (sparse_to_dense_kernel == nullptr) {
    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
    free(param);  // Fix: param came from malloc; delete was undefined behavior.
    return;
  }
  sparse_to_dense_kernel->Init();

  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
  if (sub_graph == nullptr) {
    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
    free(param);  // Fix: param came from malloc; delete was undefined behavior.
    delete sparse_to_dense_kernel;
    return;
  }
  // to do allocate memory for inputs
  in_tensor1.MallocData(allocator);
  sub_graph->Init();
  MS_LOG(INFO) << " initialize input data ";
  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));

  std::cout << "==================output data================" << std::endl;
  sub_graph->Run();
  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
  delete sub_graph;
}
TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Scalar) {
MS_LOG(INFO) << " begin test ";
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();