forked from mindspore-Ecosystem/mindspore

reconstruct code for GPU ops

parent ae62d15bc0, commit d210921115

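The diff below touches the OpenCL (GPU) kernels ArithmeticSelf, BatchNorm, Cast, Concat, Fill and SparseToDense. The common thread is that each kernel's single Init()/Run() pair is split into CheckSpecs(), Prepare(), SetConstArgs(), SetGlobalLocal() and Run(), and registration moves from hand-written *KernelCreator functions to the shared OpenCLKernelCreator<T> template. The sketch below shows how such a split lifecycle is typically driven; everything beyond the method names that appear in the patch (the driver function, the base-class shape) is an assumption for illustration, not MindSpore code.

// --- illustrative sketch, not part of the commit ---
// Hypothetical driver for the split kernel lifecycle introduced by this refactor.
struct SketchOpenCLKernel {
  virtual ~SketchOpenCLKernel() = default;
  virtual int CheckSpecs() = 0;       // validate shapes/dtypes before any GPU work
  virtual int Prepare() = 0;          // build the program once, then SetConstArgs()/SetGlobalLocal()
  virtual void SetConstArgs() = 0;    // arguments that never change between runs (shapes, axis, epsilon, ...)
  virtual void SetGlobalLocal() = 0;  // compute global_range_/local_range_ once
  virtual int Run() = 0;              // bind per-run tensors and enqueue the kernel
};

inline int RunOnce(SketchOpenCLKernel *k) {
  if (k->CheckSpecs() != 0) return -1;  // reject unsupported specs early
  if (k->Prepare() != 0) return -1;     // one-time compile + constant arguments
  return k->Run();                      // cheap per-inference path
}
// --- end sketch ---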
@ -17,22 +17,6 @@ __kernel void ArithmeticSelf_ElementAbs_NHWC4(__read_only image2d_t input0, __wr
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementAbs_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x >= 0 ? result.x : -result.x;
|
||||
result.y = result.y >= 0 ? result.y : -result.y;
|
||||
result.z = result.z >= 0 ? result.z : -result.z;
|
||||
result.w = result.w >= 0 ? result.w : -result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -49,22 +33,6 @@ __kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __wr
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCos_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = cos(result.x);
|
||||
result.y = cos(result.y);
|
||||
result.z = cos(result.z);
|
||||
result.w = cos(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -81,22 +49,6 @@ __kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __wr
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSin_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = sin(result.x);
|
||||
result.y = sin(result.y);
|
||||
result.z = sin(result.z);
|
||||
result.w = sin(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -113,22 +65,6 @@ __kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __wr
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementNeg_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = -result.x;
|
||||
result.y = -result.y;
|
||||
result.z = -result.z;
|
||||
result.w = -result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -145,22 +81,6 @@ __kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __wr
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementExp_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = exp(result.x);
|
||||
result.y = exp(result.y);
|
||||
result.z = exp(result.z);
|
||||
result.w = exp(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -177,22 +97,6 @@ __kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __wr
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLog_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 ? log(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? log(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? log(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? log(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -209,22 +113,6 @@ __kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, _
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSquare_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x * result.x;
|
||||
result.y = result.y * result.y;
|
||||
result.z = result.z * result.z;
|
||||
result.w = result.w * result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -241,22 +129,6 @@ __kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __w
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 ? sqrt(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? sqrt(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? sqrt(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? sqrt(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -273,22 +145,6 @@ __kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRsqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 ? 1.0f / sqrt(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? 1.0f / sqrt(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? 1.0f / sqrt(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? 1.0f / sqrt(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -305,22 +161,6 @@ __kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLogicalNot_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 || result.x < 0 ? false : true;
|
||||
result.y = result.y > 0 || result.y < 0 ? false : true;
|
||||
result.z = result.z > 0 || result.z < 0 ? false : true;
|
||||
result.w = result.w > 0 || result.w < 0 ? false : true;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -337,22 +177,6 @@ __kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementFloor_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = floor(result.x);
|
||||
result.y = floor(result.y);
|
||||
result.z = floor(result.z);
|
||||
result.w = floor(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -369,22 +193,6 @@ __kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __w
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCeil_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = ceil(result.x);
|
||||
result.y = ceil(result.y);
|
||||
result.z = ceil(result.z);
|
||||
result.w = ceil(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
|
@ -401,18 +209,3 @@ __kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __
|
|||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRound_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = round(result.x);
|
||||
result.y = round(result.y);
|
||||
result.z = round(result.z);
|
||||
result.w = round(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
|
|
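The arithmeticself.cl hunks above drop the *_NC4HW4 variants of the element-wise kernels (Abs, Cos, Sin, Neg, Exp, Log, Square, Sqrt, Rsqrt, LogicalNot, Floor, Ceil, Round) and keep only the *_NHWC4 ones; the two variants differ only in how a work-item (X = N*H, Y = W, Z = C/4) maps to image2d coordinates. A small host-side sketch of that index math, matching the READ_IMAGE/WRITE_IMAGE calls shown above (the helper names are illustrative, not repository code):

// --- illustrative sketch, not part of the commit ---
#include <utility>

struct Shape4 { int n, h, w, c4; };  // c4 = UP_DIV(C, 4), as in the int4 output_shape argument

// NHWC4 (kept): image x = Y * c4 + Z, image y = X
std::pair<int, int> NHWC4Coord(const Shape4 &s, int X, int Y, int Z) { return {Y * s.c4 + Z, X}; }

// NC4HW4 (deleted in this commit): image x = Y, image y = Z * H + X (output_shape.y is H)
std::pair<int, int> NC4HW4Coord(const Shape4 &s, int X, int Y, int Z) { return {Y, Z * s.h + X}; }
// --- end sketch ---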
@ -1,131 +1,48 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
#define C4NUM 4
|
||||
__kernel void SparseToDenseScalarDim0(__read_only image2d_t input, __write_only image2d_t output, float weight,
|
||||
int2 input_shape, float default_value) {
|
||||
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, 0));
|
||||
FLT4 result = {default_value, default_value, default_value, default_value};
|
||||
int integer = index_input.x / C4NUM;
|
||||
int decimal = (int)(index_input.x) % C4NUM;
|
||||
if (decimal == 0) {
|
||||
result.x = weight;
|
||||
} else if (decimal == 1) {
|
||||
result.y = weight;
|
||||
} else if (decimal == 2) {
|
||||
result.z = weight;
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void SparseToDenseScalar(__read_only image2d_t input, __global float *output, float weight, int2 inputshape,
|
||||
int4 outputshape, float default_value, int stride_w, int inshapeindex1_dim) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= inputshape.x || Y >= inputshape.y) {
|
||||
return;
|
||||
}
|
||||
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
|
||||
int index = 0;
|
||||
if (inshapeindex1_dim == 1) {
|
||||
index = ((int)index_input.x) * stride_w;
|
||||
} else if (inshapeindex1_dim == 2) {
|
||||
index = ((int)index_input.x) * stride_w + ((int)index_input.y);
|
||||
} else if (inshapeindex1_dim == 3) {
|
||||
index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + ((int)index_input.z);
|
||||
} else {
|
||||
result.w = weight;
|
||||
index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
|
||||
((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(0, integer), result);
|
||||
return;
|
||||
output[index] = weight;
|
||||
}
|
||||
|
||||
__kernel void SparseToDenseScalarDim1(__read_only image2d_t input, __write_only image2d_t output, float weight,
|
||||
int2 input_shape, float default_value) {
|
||||
for (int i = 0; i < input_shape.x; ++i) {
|
||||
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
|
||||
int Y = result.x;
|
||||
result.x = weight;
|
||||
WRITE_IMAGE(output, (int2)(0, Y), result);
|
||||
__kernel void SparseToDenseVector(__read_only image2d_t input, __global float *output, __global float *weight_vector,
|
||||
int2 inputshape, int4 outputshape, float default_value, int stride_w,
|
||||
int inshapeindex1_dim) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= inputshape.x || Y >= inputshape.y) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void SparseToDenseVectorDim1(__read_only image2d_t input, __write_only image2d_t output,
|
||||
__global float *weight, int2 input_shape, float default_value) {
|
||||
int index_weight = 0;
|
||||
for (int i = 0; i < input_shape.x; ++i) {
|
||||
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
|
||||
int Y = result.x;
|
||||
result.x = weight[index_weight++];
|
||||
WRITE_IMAGE(output, (int2)(0, Y), result);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void SparseToDenseScalarDim2Shape2(__read_only image2d_t input, __write_only image2d_t output, float weight,
|
||||
int2 input_shape, float default_value) {
|
||||
FLT temp[8] = {default_value, default_value, default_value, default_value,
|
||||
default_value, default_value, default_value, default_value};
|
||||
FLT result_temp[8] = {default_value, default_value, default_value, default_value,
|
||||
default_value, default_value, default_value, default_value};
|
||||
int index = 0; // 0~4
|
||||
int X = 0;
|
||||
FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0));
|
||||
int Y = (int)index_begin.x; // N
|
||||
temp[index] = index_begin.y; // c/4
|
||||
for (int i = 1; i < input_shape.x && index < C4NUM; ++i) {
|
||||
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i));
|
||||
if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) {
|
||||
index++;
|
||||
if (index < C4NUM) {
|
||||
temp[index] = index_input.y;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j <= index && index < C4NUM; ++j) {
|
||||
int decimal = (int)temp[j] % C4NUM;
|
||||
result_temp[decimal] = weight;
|
||||
X = ((int)temp[0]) / C4NUM;
|
||||
}
|
||||
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
|
||||
WRITE_IMAGE(output, (int2)(X, Y), result);
|
||||
index = 0;
|
||||
Y = (int)index_input.x;
|
||||
temp[0] = index_input.y;
|
||||
temp[1] = temp[2] = temp[3] = default_value;
|
||||
result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value;
|
||||
}
|
||||
}
|
||||
|
||||
// judge the last element for input
|
||||
X = ((int)temp[0]) / C4NUM;
|
||||
for (int i = 0; i <= index && index < C4NUM; ++i) {
|
||||
int decimal = (int)temp[i] % C4NUM;
|
||||
result_temp[decimal] = weight;
|
||||
}
|
||||
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
|
||||
WRITE_IMAGE(output, (int2)(X, Y), result);
|
||||
}
|
||||
|
||||
__kernel void SparseToDenseVectorDim2Shape2(__read_only image2d_t input, __write_only image2d_t output,
|
||||
__global float *weight, int2 input_shape, float default_value) {
|
||||
FLT temp[8] = {default_value, default_value, default_value, default_value,
|
||||
default_value, default_value, default_value, default_value};
|
||||
FLT result_temp[8] = {default_value, default_value, default_value, default_value,
|
||||
default_value, default_value, default_value, default_value};
|
||||
int index = 0; // 0~4
|
||||
int weight_index = 0;
|
||||
int X = 0;
|
||||
FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0));
|
||||
int Y = (int)index_begin.x; // N
|
||||
temp[index] = index_begin.y; // c/4
|
||||
for (int i = 1; i < input_shape.x && index < C4NUM; ++i) {
|
||||
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i));
|
||||
if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) {
|
||||
index++;
|
||||
if (index < C4NUM) {
|
||||
temp[index] = index_input.y;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j <= index && index < C4NUM; ++j) {
|
||||
int decimal = (int)temp[j] % C4NUM;
|
||||
result_temp[decimal] = weight[weight_index++];
|
||||
X = ((int)temp[0]) / C4NUM;
|
||||
}
|
||||
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
|
||||
WRITE_IMAGE(output, (int2)(X, Y), result);
|
||||
index = 0;
|
||||
Y = (int)index_input.x;
|
||||
temp[0] = index_input.y;
|
||||
temp[1] = temp[2] = temp[3] = default_value;
|
||||
result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value;
|
||||
}
|
||||
}
|
||||
|
||||
// judge the last element for input
|
||||
X = ((int)temp[0]) / C4NUM;
|
||||
for (int i = 0; i <= index && index < C4NUM; ++i) {
|
||||
int decimal = (int)temp[i] % C4NUM;
|
||||
result_temp[decimal] = weight[weight_index++];
|
||||
}
|
||||
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
|
||||
WRITE_IMAGE(output, (int2)(X, Y), result);
|
||||
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
|
||||
int index = 0;
|
||||
if (inshapeindex1_dim == 1) {
|
||||
index = ((int)index_input.x) * stride_w;
|
||||
} else if (inshapeindex1_dim == 2) {
|
||||
index = ((int)index_input.x) * stride_w + (int)index_input.y;
|
||||
} else if (inshapeindex1_dim == 3) {
|
||||
index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + (int)index_input.z;
|
||||
} else {
|
||||
index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
|
||||
((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
|
||||
}
|
||||
output[index] = weight_vector[X];
|
||||
}
|
||||
|
|
|
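The sparse_to_dense.cl rewrite above replaces the dimension-specific image-output kernels (SparseToDenseScalarDim0/Dim1/Dim2Shape2 and the Vector variants) with two kernels, SparseToDenseScalar and SparseToDenseVector, that run one work-item per index row and write straight into a __global float buffer at a flat offset computed from stride_w (float elements per padded image row) and inshapeindex1_dim (number of index components). A host-side sketch of that offset computation, taken from the kernel source above (the wrapper function itself is illustrative):

// --- illustrative sketch, not part of the commit ---
#include <array>

// idx holds index_input.x..w cast to int; components == inshapeindex1_dim;
// out_y == outputshape.y; out_w_c4 == outputshape.w * C4NUM; stride_w is the
// number of float elements in one padded image row.
int FlatOffset(const std::array<int, 4> &idx, int components, int stride_w, int out_y, int out_w_c4) {
  if (components == 1) return idx[0] * stride_w;
  if (components == 2) return idx[0] * stride_w + idx[1];
  if (components == 3) return idx[0] * stride_w + idx[1] * out_w_c4 + idx[2];
  return idx[0] * out_y * stride_w + idx[1] * stride_w + idx[2] * out_w_c4 + idx[3];
}
// output[FlatOffset(...)] = weight;            // SparseToDenseScalar
// output[FlatOffset(...)] = weight_vector[X];  // SparseToDenseVector
// --- end sketch ---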
@@ -87,24 +87,17 @@ void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, Arithme
  }
}

int ArithmeticSelfOpenCLKernel::Init() {
int ArithmeticSelfOpenCLKernel::CheckSpecs() {
  if (in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) {
    MS_LOG(ERROR) << " only support dim = 4 or 2 but your dim = " << in_tensors_[0]->shape().size();
    return RET_ERROR;
  }
  auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_);
  return RET_OK;
}

  std::string kernel_name = "ArithmeticSelf";
  GetKernelName(&kernel_name, param);
  kernel_name += "_NHWC4";
  MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
  std::set<std::string> build_options;
  std::string source = arithmeticself_source;
  std::string program_name = "ArithmeticSelf";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);

  return mindspore::lite::RET_OK;
void ArithmeticSelfOpenCLKernel::SetConstArgs() {
  int arg_cn = 2;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
}

void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {

@@ -121,11 +114,8 @@ void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<s
  local->push_back(z);
}

int ArithmeticSelfOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";

void ArithmeticSelfOpenCLKernel::SetGlobalLocal() {
  auto output_shape = out_tensors_[0]->shape();
  cl_int4 output_shape_ = {};
  uint32_t OH = 1, OW = 1, OC = 1;
  if (output_shape.size() == 4) {
    output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};

@@ -142,49 +132,48 @@ int ArithmeticSelfOpenCLKernel::Run() {
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {OH, OW, OC};
  ArithmeticSelfGetWorkGroup(global, &local, max_global[0]);
  OpenCLKernel::AlignGlobalLocal(global, local);
}

int ArithmeticSelfOpenCLKernel::Prepare() {
  auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_);
  std::string kernel_name = "ArithmeticSelf";
  GetKernelName(&kernel_name, param);
  kernel_name += "_NHWC4";
  MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
  std::set<std::string> build_options;
  std::string source = arithmeticself_source;
  std::string program_name = "ArithmeticSelf";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  SetGlobalLocal();
  SetConstArgs();

  return RET_OK;
}

int ArithmeticSelfOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  int arg_cn = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);

  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);

  return mindspore::lite::RET_OK;
  return RET_OK;
}

kernel::LiteKernel *OpenCLArithmeticSelfKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                      const std::vector<lite::Tensor *> &outputs,
                                                      OpParameter *opParameter, const lite::InnerContext *ctx,
                                                      const kernel::KernelKey &desc,
                                                      const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) ArithmeticSelfOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new ArithmeticSelfOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != mindspore::lite::RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: ArithmeticSelf ";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)

} // namespace mindspore::kernel

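In the registration lines above, the hand-written OpenCLArithmeticSelfKernelCreator (allocation, Init() call, error cleanup) is replaced by the shared OpenCLKernelCreator<ArithmeticSelfOpenCLKernel> template. The template's definition is not part of this diff; based on the creator it replaces it plausibly looks roughly like the sketch below (assumes the same headers as the file above; the real template may differ, for example in which lifecycle method it calls):

// --- illustrative reconstruction, not part of the commit ---
template <class T>
kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                        const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                        const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                        const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) T(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "Create OpenCL kernel failed.";
    free(opParameter);
    return nullptr;
  }
  if (kernel->CheckSpecs() != RET_OK) {  // the real template may instead call Init()/Prepare()
    MS_LOG(ERROR) << "CheckSpecs failed for OpenCL kernel.";
    delete kernel;
    return nullptr;
  }
  return kernel;
}
// --- end sketch ---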
@@ -32,13 +32,17 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel {

  ~ArithmeticSelfOpenCLKernel() override = default;

  int Init() override;
  int Prepare() override;

  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;

  int Run() override;

 private:
  void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param);

  cl_int4 output_shape_ = {};
  cl::Kernel kernel_;
};

@@ -30,16 +30,7 @@ using mindspore::schema::PrimitiveType_BatchNorm;

namespace mindspore::kernel {

int BatchNormOpenCLKernel::Init() {
  std::string kernel_name = "Batch_normalization_NHWC4";
  std::set<std::string> build_options;
  std::string source = batchnorm_source;
  std::string program_name = "Batch_normalization";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
}
int BatchNormOpenCLKernel::CheckSpecs() { return RET_OK; }

void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  const int max_divider = 8;

@@ -55,13 +46,17 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
  local->push_back(z);
}

int BatchNormOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
void BatchNormOpenCLKernel::SetConstArgs() {
  int arg_cn = 6;
  auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
  auto input0_shape = in_tensors_[0]->shape();
  auto output_shape = out_tensors_[0]->shape();
  cl_int4 input_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], UP_DIV(input0_shape[3], C4NUM)};
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
}

void BatchNormOpenCLKernel::SetGlobalLocal() {
  auto output_shape = out_tensors_[0]->shape();
  uint32_t OH = output_shape[1];
  uint32_t OW = output_shape[2];
  uint32_t OC = UP_DIV(output_shape[3], C4NUM);

@@ -70,6 +65,25 @@ int BatchNormOpenCLKernel::Run() {
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {OH, OW, OC};
  BatchNormGetWorkGroup(global, &local, max_global[0]);
  OpenCLKernel::AlignGlobalLocal(global, local);
}

int BatchNormOpenCLKernel::Prepare() {
  std::string kernel_name = "Batch_normalization_NHWC4";
  std::set<std::string> build_options;
  std::string source = batchnorm_source;
  std::string program_name = "Batch_normalization";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetConstArgs();
  SetGlobalLocal();

  return RET_OK;
}

int BatchNormOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  int arg_cn = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());   // scale

@@ -77,32 +91,11 @@ int BatchNormOpenCLKernel::Run() {
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());   // mean
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c());   // variance
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);

  return RET_OK;
}

kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                 const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                 const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                                 const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) BatchNormOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new BatchnormOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Batchnorm ";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>)
} // namespace mindspore::kernel

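BatchNormOpenCLKernel::Run() above binds five tensors (input, scale, presumably offset in the slot the hunk cuts off, mean, variance) plus the shape and param->epsilon_ that SetConstArgs() now supplies once. Per element the kernel computes the standard inference-time batch normalization; a scalar reference that can be used to sanity-check the GPU output (illustrative helper, not the project's test code):

// --- illustrative sketch, not part of the commit ---
#include <cmath>
#include <cstddef>

// y = scale * (x - mean) / sqrt(variance + eps) + offset, applied per channel
// of an NHWC buffer; eps corresponds to BatchNormParameter::epsilon_.
void BatchNormRef(const float *x, const float *scale, const float *offset, const float *mean,
                  const float *variance, float *y, size_t outer, size_t channels, float eps) {
  for (size_t i = 0; i < outer; ++i) {
    for (size_t c = 0; c < channels; ++c) {
      const size_t k = i * channels + c;
      y[k] = scale[c] * (x[k] - mean[c]) / std::sqrt(variance[c] + eps) + offset[c];
    }
  }
}
// --- end sketch ---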
@@ -31,9 +31,12 @@ class BatchNormOpenCLKernel : public OpenCLKernel {

  ~BatchNormOpenCLKernel() override = default;

  int Init() override;

  int Run() override;
  int Prepare() override;

  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;

 private:
  cl::Kernel kernel_;

@@ -1,3 +1,4 @@

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *

@@ -42,18 +43,13 @@ int CastOpenCLKernel::GetKernelName(std::string *kernel_name, CastParameter *par
  return RET_OK;
}

int CastOpenCLKernel::Init() {
  auto param = reinterpret_cast<CastParameter *>(this->op_parameter_);
  std::string kernel_name = "Cast";
  GetKernelName(&kernel_name, param);
  kernel_name += "_NHWC4";
  std::set<std::string> build_options;
  std::string source = cast_source;
  std::string program_name = "cast";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
int CastOpenCLKernel::CheckSpecs() { return RET_OK; }

void CastOpenCLKernel::SetConstArgs() {
  auto input_shape = in_tensors_[0]->shape();
  cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
  int arg_cn = 2;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
}

void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {

@@ -70,11 +66,8 @@ void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *lo
  local->push_back(z);
}

int CastOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
void CastOpenCLKernel::SetGlobalLocal() {
  auto input_shape = in_tensors_[0]->shape();
  cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};

  uint32_t OH = input_shape[1];
  uint32_t OW = input_shape[2];
  uint32_t OC = UP_DIV(input_shape[3], C4NUM);

@@ -83,34 +76,35 @@ int CastOpenCLKernel::Run() {
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {OH, OW, OC};
  CastGetWorkGroup(global, &local, max_global[0]);
  OpenCLKernel::AlignGlobalLocal(global, local);
}

int CastOpenCLKernel::Prepare() {
  auto param = reinterpret_cast<CastParameter *>(this->op_parameter_);
  std::string kernel_name = "Cast";
  GetKernelName(&kernel_name, param);
  kernel_name += "_NHWC4";
  std::set<std::string> build_options;
  std::string source = cast_source;
  std::string program_name = "cast";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetConstArgs();
  SetGlobalLocal();
  return RET_OK;
}

int CastOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  int arg_cn = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);

  return RET_OK;
}

kernel::LiteKernel *OpenCLCastKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                            const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                            const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) CastOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new CastOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Cast ";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLCastKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLCastKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>);
} // namespace mindspore::kernel

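A pattern worth noting in the Cast refactor above (and in ArithmeticSelf): Run() always binds the per-inference tensor pointers first, and SetConstArgs() continues from the next slot, so the constant arguments are set exactly once in Prepare(). For Cast that slot layout is as follows (hypothetical enum, only spelling out the convention visible in the code above):

// --- illustrative sketch, not part of the commit ---
enum CastArgSlot : int {
  kCastInput = 0,   // bound every Run(): in_tensors_[0]->data_c()
  kCastOutput = 1,  // bound every Run(): out_tensors_[0]->data_c()
  kCastShape = 2,   // bound once in SetConstArgs(): cl_int4 NHWC4 input shape
};
// Other kernels start their constant arguments later, e.g. BatchNorm at slot 6
// and Concat at 2 * in_tensors_.size() + 1, as seen in the surrounding hunks.
// --- end sketch ---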
@@ -31,8 +31,11 @@ class CastOpenCLKernel : public OpenCLKernel {
      : OpenCLKernel(parameter, inputs, outputs) {}

  ~CastOpenCLKernel() override = default;
  int Prepare() override;

  int Init() override;
  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;

  int Run() override;

@ -49,7 +49,21 @@ int ConcatOpenCLKernel::RunAxis0() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConcatOpenCLKernel::Init() {
|
||||
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
|
||||
const int max_divider = 8;
|
||||
const int max_x = 2, max_y = 8;
|
||||
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
|
||||
int yz = max_size / x;
|
||||
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
|
||||
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
|
||||
|
||||
local->clear();
|
||||
local->push_back(x);
|
||||
local->push_back(y);
|
||||
local->push_back(z);
|
||||
}
|
||||
|
||||
int ConcatOpenCLKernel::CheckSpecs() {
|
||||
if (in_tensors_[0]->shape().size() != 4) {
|
||||
MS_LOG(ERROR) << " only support dim = 4 ";
|
||||
return RET_ERROR;
|
||||
|
@ -65,6 +79,32 @@ int ConcatOpenCLKernel::Init() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ConcatOpenCLKernel::SetConstArgs() {
|
||||
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
|
||||
int arg_cn = 2 * in_tensors_.size() + 1;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
}
|
||||
|
||||
void ConcatOpenCLKernel::SetGlobalLocal() {
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1};
|
||||
uint32_t OH = output_shape[0] * output_shape[1];
|
||||
uint32_t OW = output_shape[2];
|
||||
uint32_t OC = output_shape[3];
|
||||
std::vector<size_t> global = {OH, OW, OC};
|
||||
ConcatGetWorkGroup(global, &local, max_global[0]);
|
||||
OpenCLKernel::AlignGlobalLocal(global, local);
|
||||
}
|
||||
|
||||
int ConcatOpenCLKernel::Prepare() {
|
||||
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
|
||||
std::string kernel_name = "Concat";
|
||||
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
|
||||
kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_);
|
||||
|
@ -80,38 +120,17 @@ int ConcatOpenCLKernel::Init() {
|
|||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
|
||||
const int max_divider = 8;
|
||||
const int max_x = 2, max_y = 8;
|
||||
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
|
||||
int yz = max_size / x;
|
||||
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
|
||||
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
|
||||
|
||||
local->clear();
|
||||
local->push_back(x);
|
||||
local->push_back(y);
|
||||
local->push_back(z);
|
||||
}
|
||||
|
||||
int ConcatOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
|
||||
if (param->axis_ == 0) {
|
||||
return RunAxis0();
|
||||
}
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
|
||||
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1};
|
||||
uint32_t OH = output_shape_.s[0] * output_shape_.s[1];
|
||||
uint32_t OW = output_shape_.s[2];
|
||||
uint32_t OC = output_shape_.s[3];
|
||||
std::vector<size_t> global = {OH, OW, OC};
|
||||
ConcatGetWorkGroup(global, &local, max_global[0]);
|
||||
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
|
||||
int arg_cn = 0;
|
||||
for (int i = 0; i < in_tensors_.size(); ++i) {
|
||||
|
@ -123,35 +142,14 @@ int ConcatOpenCLKernel::Run() {
|
|||
UP_DIV(in_tensors_[i]->shape()[3], C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp);
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
|
||||
return RET_ERROR;
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
|
||||
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
auto *kernel = new (std::nothrow) ConcatOpenCLKernel(opParameter, inputs, outputs);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << " new ConcatOpenCLKernel failed ";
|
||||
free(opParameter);
|
||||
return nullptr;
|
||||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << " Init kernel failed, name: Concat ";
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLConcatKernelCreator);
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLConcatKernelCreator);
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>);
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>);
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
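ConcatGetWorkGroup (moved ahead of CheckSpecs in the hunks above) chooses the local work-group size so that its product stays within max_size: a divisor of global[0] capped at 2, a divisor of global[1] capped at 8 and by the remaining budget, and the remainder for z. GetMaxDivisorStrategy1 is not shown in this diff; the sketch below uses a simple largest-divisor stand-in only to illustrate the shape of the heuristic:

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <cstddef>
#include <vector>

// Stand-in for GetMaxDivisorStrategy1: largest divisor of n that is <= cap.
// The real MindSpore helper may pick divisors differently.
static int LargestDivisorUpTo(size_t n, int cap) {
  for (int d = cap; d > 1; --d) {
    if (n % static_cast<size_t>(d) == 0) return d;
  }
  return 1;
}

// Mirrors ConcatGetWorkGroup above; max_size is the device work-group limit
// (max_global[0] in the caller).
void PickLocal(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  const int max_divider = 8, max_x = 2, max_y = 8;
  int x = std::min(LargestDivisorUpTo(global[0], max_divider), max_x);
  int yz = max_size / x;
  int y = std::min({LargestDivisorUpTo(global[1], max_divider), yz, max_y});
  int z = std::min(yz / y, static_cast<int>((global[2] + 1) / 2));  // UP_DIV(global[2], 2)
  *local = {static_cast<size_t>(x), static_cast<size_t>(y), static_cast<size_t>(z)};
}
// --- end sketch ---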
@@ -31,8 +31,11 @@ class ConcatOpenCLKernel : public OpenCLKernel {

  ~ConcatOpenCLKernel() override = default;

  int Init() override;
  int Prepare() override;

  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;
  int Run() override;

 private:

@@ -62,7 +62,11 @@ int FillOpenCLKernel::RunShape() {
  return RET_OK;
}

int FillOpenCLKernel::Init() {
void FillOpenCLKernel::SetConstArgs() {}

void FillOpenCLKernel::SetGlobalLocal() {}

int FillOpenCLKernel::CheckSpecs() {
  auto param = this->op_parameter_;

  if (out_tensors_[0]->shape().size() > 4) {

@@ -76,6 +80,8 @@ int FillOpenCLKernel::Init() {
  return RET_OK;
}

int FillOpenCLKernel::Prepare() { return RET_OK; }

int FillOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = this->op_parameter_;

@@ -88,28 +94,9 @@ int FillOpenCLKernel::Run() {
  return RET_OK;
}

kernel::LiteKernel *FillOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                            const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                            const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) FillOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new FillOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: fill ";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>);

} // namespace mindspore::kernel

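For Fill, the refactor leaves all of the real work in Run() (which dispatches to RunFill/RunShape, outside these hunks), so SetConstArgs() and SetGlobalLocal() become empty and Prepare() just returns RET_OK; only CheckSpecs() keeps logic. For reference, the operator's semantics are simply as sketched below (illustrative helper, not project code):

// --- illustrative sketch, not part of the commit ---
#include <cstddef>
#include <vector>

// Fill: produce element_count copies of value. (Shape, registered to the same
// kernel above, instead emits the input tensor's dimensions.)
std::vector<float> FillRef(size_t element_count, float value) {
  return std::vector<float>(element_count, value);
}
// --- end sketch ---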
@@ -32,7 +32,11 @@ class FillOpenCLKernel : public OpenCLKernel {

  ~FillOpenCLKernel() override = default;

  int Init() override;
  int Prepare() override;

  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;

  int Run() override;

@ -90,24 +90,64 @@ int SparseToDenseOpenCLKernel::InitWeights() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int SparseToDenseOpenCLKernel::Init() {
|
||||
int SparseToDenseOpenCLKernel::CheckSpecs() {
|
||||
if (in_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().size() > 4) {
|
||||
MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size() << "outdim"
|
||||
<< out_tensors_[0]->shape().size();
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (out_tensors_[0]->shape().size() > 2 || in_tensors_.size() < 3) {
|
||||
MS_LOG(ERROR) << " only support dim <= 2 and in_tensors_.size >= 3";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if ((in_tensors_[0]->shape()[1] > 3) && (input_dim_ == 2)) {
|
||||
MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: "
|
||||
<< in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_;
|
||||
if (input_dim_ == 2) {
|
||||
if ((in_tensors_[0]->shape()[1] > 4)) {
|
||||
MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: "
|
||||
<< in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_;
|
||||
return ERROR;
|
||||
}
|
||||
}
|
||||
if (inshapeindex1_dim > 4) {
|
||||
MS_LOG(ERROR) << "Unsupported input_indices[1] > 4: ";
|
||||
return ERROR;
|
||||
}
|
||||
input_dim_ = in_tensors_[0]->shape().size();
|
||||
weight_scalar_ = in_tensors_[2]->IsScalar();
|
||||
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "ScalarDim" : "VectorDim") +
|
||||
std::to_string(in_tensors_[0]->shape()[1] == 1 ? 1 : input_dim_);
|
||||
if (input_dim_ == 2 && in_tensors_[0]->shape()[1] != 1) {
|
||||
kernel_name += "Shape" + std::to_string(in_tensors_[0]->shape()[1]);
|
||||
auto param = reinterpret_cast<SparseToDenseParameter *>(op_parameter_);
|
||||
if (param->validate_indices_) {
|
||||
MS_LOG(ERROR) << "Unspported unordered for in_tensors_indices";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void SparseToDenseOpenCLKernel::SetConstArgs() {
|
||||
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
|
||||
Image2DInfo img_info(out_tensors_[0]);
|
||||
size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
|
||||
stride_w = img_info.RowPitch() / dtype;
|
||||
cl_int2 input_shape = {n_ * h_, w_ * UP_DIV(c_, C4NUM)};
|
||||
auto out_shape_temp = out_tensors_[0]->shape();
|
||||
cl_int4 out_shape = {out_n_, out_h_, out_w_, UP_DIV(out_c_, C4NUM)};
|
||||
int arg_cn = 3;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim);
|
||||
}
|
||||
|
||||
void SparseToDenseOpenCLKernel::SetGlobalLocal() {
|
||||
std::vector<size_t> local = {1, 1};
|
||||
size_t OH = n_ * h_;
|
||||
size_t OW = w_ * UP_DIV(c_, C4NUM);
|
||||
std::vector<size_t> global = {OH, OW};
|
||||
OpenCLKernel::AlignGlobalLocal(global, local);
|
||||
}
|
||||
|
||||
int SparseToDenseOpenCLKernel::Prepare() {
|
||||
input_dim_ = in_tensors_[0]->shape().size();
|
||||
inshapeindex1_dim = in_tensors_[0]->shape()[1];
|
||||
weight_scalar_ = in_tensors_[2]->IsScalar();
|
||||
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
|
||||
std::set<std::string> build_options;
|
||||
std::string source = sparse_to_dense_source;
|
||||
std::string program_name = "SparseToDense";
|
||||
|
@ -122,8 +162,10 @@ int SparseToDenseOpenCLKernel::Init() {
|
|||
default_ = *reinterpret_cast<float *>(input_tensor3->data_c());
|
||||
}
|
||||
}
|
||||
|
||||
InitWeights();
|
||||
InferShapeTo4D();
|
||||
SetGlobalLocal();
|
||||
SetConstArgs();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -131,73 +173,47 @@ int SparseToDenseOpenCLKernel::Init() {
|
|||
int SparseToDenseOpenCLKernel::InferShapeTo4D() {
|
||||
if (in_tensors_[0]->shape().size() <= 4) {
|
||||
if (in_tensors_[0]->shape().size() == 1) {
|
||||
N_ = in_tensors_[0]->shape()[0];
|
||||
n_ = in_tensors_[0]->shape()[0];
|
||||
} else if (in_tensors_[0]->shape().size() == 2) {
|
||||
N_ = in_tensors_[0]->shape()[0];
|
||||
C_ = in_tensors_[0]->shape()[1];
|
||||
} else if (in_tensors_[0]->shape().size() == 3) {
|
||||
N_ = in_tensors_[0]->shape()[0];
|
||||
W_ = in_tensors_[0]->shape()[1];
|
||||
C_ = in_tensors_[0]->shape()[2];
|
||||
} else {
|
||||
N_ = in_tensors_[0]->shape()[0];
|
||||
H_ = in_tensors_[0]->shape()[1];
|
||||
W_ = in_tensors_[0]->shape()[2];
|
||||
C_ = in_tensors_[0]->shape()[3];
|
||||
n_ = in_tensors_[0]->shape()[0];
|
||||
c_ = in_tensors_[0]->shape()[1];
|
||||
}
|
||||
}
|
||||
if (out_tensors_[0]->shape().size() <= 4) {
|
||||
if (out_tensors_[0]->shape().size() == 1) {
|
||||
out_n_ = out_tensors_[0]->shape()[0];
|
||||
} else if (out_tensors_[0]->shape().size() == 2) {
|
||||
out_n_ = out_tensors_[0]->shape()[0];
|
||||
out_c_ = out_tensors_[0]->shape()[1];
|
||||
} else if (out_tensors_[0]->shape().size() == 3) {
|
||||
out_n_ = out_tensors_[0]->shape()[0];
|
||||
out_w_ = out_tensors_[0]->shape()[1];
|
||||
out_c_ = out_tensors_[0]->shape()[2];
|
||||
} else {
|
||||
out_n_ = out_tensors_[0]->shape()[0];
|
||||
out_h_ = out_tensors_[0]->shape()[1];
|
||||
out_w_ = out_tensors_[0]->shape()[2];
|
||||
out_c_ = out_tensors_[0]->shape()[3];
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size();
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int SparseToDenseOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  InferShapeTo4D();
  cl_int2 input_shape = {static_cast<cl_int>(N_ * H_), static_cast<cl_int>(W_ * UP_DIV(C_, C4NUM))};
  InitOutputToDefault();
  std::vector<size_t> local = {1, 1};
  std::vector<size_t> global = {1, 1};
  int arg_cn = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
  if (weight_scalar_) {
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
  if (!weight_scalar_) {
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF);
  } else {
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
  }
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
  return RET_OK;
}

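The cl_int2 passed to the kernel packs the 4D shape into the 2D image layout used by the NHWC4 kernels: the first component covers N * H rows and the second covers W times the number of channel slices, where each slice holds one FLT4. A small standalone illustration of that arithmetic, assuming C4NUM is 4 and UP_DIV is the usual ceiling division (both are hypothetical stand-ins here):

#include <cstdio>

// Hypothetical standalone versions of the C4NUM / UP_DIV helpers used above.
constexpr int C4NUM = 4;
constexpr int UP_DIV(int x, int y) { return (x + y - 1) / y; }

int main() {
  // For a tensor of shape N=6, H=1, W=1, C=10:
  int N = 6, H = 1, W = 1, C = 10;
  int image_x = N * H;                 // 6 rows
  int image_y = W * UP_DIV(C, C4NUM);  // 1 * ceil(10 / 4) = 3 FLT4 columns
  printf("input_shape = {%d, %d}\n", image_x, image_y);
  return 0;
}
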
kernel::LiteKernel *SparseToDenseOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                     const std::vector<lite::Tensor *> &outputs,
                                                     OpParameter *opParameter, const lite::InnerContext *ctx,
                                                     const kernel::KernelKey &desc,
                                                     const mindspore::lite::PrimitiveC *primitive) {
  if (inputs.empty()) {
    MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
    free(opParameter);
    return nullptr;
  }
  auto *kernel = new (std::nothrow) SparseToDenseOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new SparseToDenseOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: SparseToDense ";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>);
} // namespace mindspore::kernel

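The hand-written creator above is superseded by the shared OpenCLKernelCreator<SparseToDenseOpenCLKernel> registered in the last two REG_KERNEL lines. Its real implementation is not part of this diff; the self-contained mock below only sketches the factory-template pattern it presumably follows (construct the kernel, run Init(), clean up on failure), using made-up types:

// Illustrative only -- not the actual MindSpore Lite OpenCLKernelCreator.
#include <cstdio>
#include <new>

constexpr int RET_OK = 0;

struct MockKernel {
  int Init() { return RET_OK; }  // stands in for the real kernel's Init()/Prepare()
  void Run() { printf("running\n"); }
};

template <class T>
T *KernelCreatorSketch() {
  auto *kernel = new (std::nothrow) T();
  if (kernel == nullptr) {
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {  // mirrors the error handling in the creator above
    delete kernel;
    return nullptr;
  }
  return kernel;
}

int main() {
  auto *k = KernelCreatorSketch<MockKernel>();
  if (k != nullptr) {
    k->Run();
    delete k;
  }
  return 0;
}
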
@@ -31,9 +31,12 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
  ~SparseToDenseOpenCLKernel() override = default;

  int Init() override;
  int Prepare() override;
  int Run() override;
  int InitWeights() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;
  int CheckSpecs() override;

 private:
  int InferShapeTo4D();

@@ -47,12 +50,19 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
  float weight_scalar_{0.f};
  void *weight_vector_{nullptr};
  int input_dim_{1};
  int inshapeindex1_dim{1};
  cl_int stride_w{1};
  std::vector<int32_t> output_shape_;

  size_t N_{1};
  size_t H_{1};
  size_t W_{1};
  size_t C_{1};
  cl_int n_{1};
  cl_int h_{1};
  cl_int w_{1};
  cl_int c_{1};

  cl_int out_n_{1};
  cl_int out_h_{1};
  cl_int out_w_{1};
  cl_int out_c_{1};
};
} // namespace mindspore::kernel
#endif

@@ -112,7 +112,7 @@ struct Image2DInfo {
  size_t RowPitch() const {
    auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
    int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
    size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size;
    size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size;
    return row_pitch;
  }

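Both row_pitch lines above compute the same quantity: the image width rounded up to the next multiple of the device pitch alignment, times the byte size of one FLT4. A quick standalone check of that equivalence, assuming UP_ROUND(x, n) is the usual ((x + n - 1) / n) * n rounding macro:

#include <cassert>
#include <cstddef>

// Assumed definition of the UP_ROUND macro used above.
#define UP_ROUND(x, n) (((x) + (n)-1) / (n) * (n))

int main() {
  const size_t FLT4_size = 4 * sizeof(float);  // illustrative: fp32 FLT4
  size_t width = 100;
  int alignment = 64;
  size_t manual = (width + alignment - 1) / alignment * alignment * FLT4_size;
  size_t rounded = UP_ROUND(width, alignment) * FLT4_size;
  assert(manual == rounded);  // both give 128 * 16 = 2048 bytes
  return 0;
}
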
@@ -98,8 +98,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) {
  }
  arithmeticself_kernel->Init();
  // to do allocate memory for inputs and outputs
  for (auto &input_tensor : inputs) {
    input_tensor->MallocData(allocator);
  for (auto &input_tensor_ : inputs) {
    input_tensor_->MallocData(allocator);
  }
  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};

@@ -186,8 +186,8 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) {
  }
  arithmeticself_kernel->Init();
  // to do allocate memory for inputs and outputs
  for (auto &input_tensor : inputs) {
    input_tensor->MallocData(allocator);
  for (auto &input_tensor_ : inputs) {
    input_tensor_->MallocData(allocator);
  }
  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};

@@ -206,7 +206,6 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) {
  }
  sub_graph->Init();
  MS_LOG(INFO) << " initialize input data ";
  std::cout << sizeof(input_data1) / sizeof(float) << std::endl;
  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));

  std::cout << "==================output data================" << std::endl;

@@ -280,8 +279,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfdim2Fp16) {
  }
  arithmeticself_kernel->Init();
  // to do allocate memory for inputs and outputs
  for (auto &input_tensor : inputs) {
    input_tensor->MallocData(allocator);
  for (auto &input_tensor_ : inputs) {
    input_tensor_->MallocData(allocator);
  }
  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};

@@ -28,6 +28,78 @@ class TestSparseToDenseOpenCLCI : public mindspore::CommonTest {
  TestSparseToDenseOpenCLCI() {}
};

TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Shape3Vector) {
  MS_LOG(INFO) << " begin test ";
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  auto runtime = runtime_wrapper.GetInstance();
  runtime->Init();
  auto allocator = runtime->GetAllocator();

  MS_LOG(INFO) << " init tensors ";
  std::vector<int> input_shape1 = {6, 3};
  std::vector<int> input_shape2 = {3};
  std::vector<int> input_shape3 = {6};
  std::vector<int> input_shape4 = {1};
  float input_data1[] = {0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5};
  float input_data2[] = {6, 1, 10};
  float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  float input_data4[] = {0.0};
  float correctOutput[] = {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  auto data_type = kNumberTypeFloat32;
  std::vector<int> output_shape = {6, 1, 10};
  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_TENSOR);
  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
  // allocate memory for weights
  in_tensor2.MallocData();
  in_tensor3.MallocData();
  in_tensor4.MallocData();
  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
  std::vector<lite::Tensor *> outputs{&output_tensor};
  // initialize weights
  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
  MS_LOG(INFO) << " initialize tensors ";
  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
  if (param == nullptr) {
    MS_LOG(INFO) << " new SparseToDenseParameter failed ";
    return;
  }

  auto *sparse_to_dense_kernel =
    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  if (sparse_to_dense_kernel == nullptr) {
    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
    delete param;
    return;
  }
  sparse_to_dense_kernel->Init();
  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
  if (sub_graph == nullptr) {
    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
    delete param;
    delete sparse_to_dense_kernel;
    return;
  }
  // to do allocate memory for inputs
  in_tensor1.MallocData(allocator);
  sub_graph->Init();
  MS_LOG(INFO) << " initialize input data ";
  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));

  std::cout << "==================output data================" << std::endl;
  sub_graph->Run();
  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
  delete sub_graph;
}

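The expected output in the test above follows directly from the SparseToDense contract: start from a {6, 1, 10} tensor filled with the default value (0.0 here) and scatter values[i] into the position named by row i of the indices tensor. A minimal CPU sketch of that reference computation; the SparseToDenseRef helper is hypothetical, written only to illustrate the mapping:

#include <cstdio>
#include <vector>

// Scatter `values` into a dense tensor of shape {d0, d1, d2} pre-filled with `def`.
std::vector<float> SparseToDenseRef(const std::vector<std::vector<int>> &indices,
                                    const std::vector<float> &values, float def,
                                    int d0, int d1, int d2) {
  std::vector<float> dense(d0 * d1 * d2, def);
  for (size_t i = 0; i < indices.size(); ++i) {
    const auto &idx = indices[i];
    dense[(idx[0] * d1 + idx[1]) * d2 + idx[2]] = values[i];
  }
  return dense;
}

int main() {
  // Indices and values from the test above: rows (0,0,0)..(0,0,5), values 1..6.
  std::vector<std::vector<int>> indices = {{0, 0, 0}, {0, 0, 1}, {0, 0, 2},
                                           {0, 0, 3}, {0, 0, 4}, {0, 0, 5}};
  std::vector<float> values = {1, 2, 3, 4, 5, 6};
  auto dense = SparseToDenseRef(indices, values, 0.0f, 6, 1, 10);
  printf("%g %g %g ...\n", dense[0], dense[1], dense[5]);  // 1 2 6 ...
  return 0;
}
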
TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Scalar) {
  MS_LOG(INFO) << " begin test ";
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();