forked from mindspore-Ecosystem/mindspore
!5876 [MS][LITE][Develop] add new ops named arithmetic_self
Merge pull request !5876 from pengyongrong/arithmetic_self
This commit is contained in:
commit
c263eff456
|
@ -0,0 +1,450 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void ArithmeticSelf_ElementAbs_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = result.x >= 0 ? result.x : -result.x;
|
||||
result.y = result.y >= 0 ? result.y : -result.y;
|
||||
result.z = result.z >= 0 ? result.z : -result.z;
|
||||
result.w = result.w >= 0 ? result.w : -result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementAbs_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x >= 0 ? result.x : -result.x;
|
||||
result.y = result.y >= 0 ? result.y : -result.y;
|
||||
result.z = result.z >= 0 ? result.z : -result.z;
|
||||
result.w = result.w >= 0 ? result.w : -result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = cos(result.x);
|
||||
result.y = cos(result.y);
|
||||
result.z = cos(result.z);
|
||||
result.w = cos(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCos_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = cos(result.x);
|
||||
result.y = cos(result.y);
|
||||
result.z = cos(result.z);
|
||||
result.w = cos(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = sin(result.x);
|
||||
result.y = sin(result.y);
|
||||
result.z = sin(result.z);
|
||||
result.w = sin(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSin_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = sin(result.x);
|
||||
result.y = sin(result.y);
|
||||
result.z = sin(result.z);
|
||||
result.w = sin(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementTanh_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = tanh(result.x);
|
||||
result.y = tanh(result.y);
|
||||
result.z = tanh(result.z);
|
||||
result.w = tanh(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementTanh_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = tanh(result.x);
|
||||
result.y = tanh(result.y);
|
||||
result.z = tanh(result.z);
|
||||
result.w = tanh(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = -result.x;
|
||||
result.y = -result.y;
|
||||
result.z = -result.z;
|
||||
result.w = -result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementNeg_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = -result.x;
|
||||
result.y = -result.y;
|
||||
result.z = -result.z;
|
||||
result.w = -result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = exp(result.x);
|
||||
result.y = exp(result.y);
|
||||
result.z = exp(result.z);
|
||||
result.w = exp(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementExp_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = exp(result.x);
|
||||
result.y = exp(result.y);
|
||||
result.z = exp(result.z);
|
||||
result.w = exp(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = result.x > 0 ? log(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? log(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? log(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? log(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLog_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 ? log(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? log(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? log(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? log(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = result.x * result.x;
|
||||
result.y = result.y * result.y;
|
||||
result.z = result.z * result.z;
|
||||
result.w = result.w * result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSquare_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x * result.x;
|
||||
result.y = result.y * result.y;
|
||||
result.z = result.z * result.z;
|
||||
result.w = result.w * result.w;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = result.x > 0 ? sqrt(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? sqrt(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? sqrt(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? sqrt(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementSqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 ? sqrt(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? sqrt(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? sqrt(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? sqrt(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = result.x > 0 ? 1.0f / sqrt(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? 1.0f / sqrt(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? 1.0f / sqrt(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? 1.0f / sqrt(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRsqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 ? 1.0f / sqrt(result.x) : HUGE_VALF;
|
||||
result.y = result.y > 0 ? 1.0f / sqrt(result.y) : HUGE_VALF;
|
||||
result.z = result.z > 0 ? 1.0f / sqrt(result.z) : HUGE_VALF;
|
||||
result.w = result.w > 0 ? 1.0f / sqrt(result.w) : HUGE_VALF;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = result.x > 0 || result.x < 0 ? false : true;
|
||||
result.y = result.y > 0 || result.y < 0 ? false : true;
|
||||
result.z = result.z > 0 || result.z < 0 ? false : true;
|
||||
result.w = result.w > 0 || result.w < 0 ? false : true;
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementLogicalNot_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = result.x > 0 || result.x < 0 ? false : true;
|
||||
result.y = result.y > 0 || result.y < 0 ? false : true;
|
||||
result.z = result.z > 0 || result.z < 0 ? false : true;
|
||||
result.w = result.w > 0 || result.w < 0 ? false : true;
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = floor(result.x);
|
||||
result.y = floor(result.y);
|
||||
result.z = floor(result.z);
|
||||
result.w = floor(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementFloor_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = floor(result.x);
|
||||
result.y = floor(result.y);
|
||||
result.z = floor(result.z);
|
||||
result.w = floor(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = ceil(result.x);
|
||||
result.y = ceil(result.y);
|
||||
result.z = ceil(result.z);
|
||||
result.w = ceil(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementCeil_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = ceil(result.x);
|
||||
result.y = ceil(result.y);
|
||||
result.z = ceil(result.z);
|
||||
result.w = ceil(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)));
|
||||
result.x = round(result.x);
|
||||
result.y = round(result.y);
|
||||
result.z = round(result.z);
|
||||
result.w = round(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
|
||||
__kernel void ArithmeticSelf_ElementRound_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
|
||||
int4 output_shape) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
|
||||
result.x = round(result.x);
|
||||
result.y = round(result.y);
|
||||
result.z = round(result.z);
|
||||
result.w = round(result.w);
|
||||
WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
|
||||
}
|
|
@ -0,0 +1,240 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include<string>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/arithmetic_self.h"
|
||||
#include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::schema::PrimitiveType_Abs;
|
||||
using mindspore::schema::PrimitiveType_Ceil;
|
||||
using mindspore::schema::PrimitiveType_Cos;
|
||||
using mindspore::schema::PrimitiveType_Exp;
|
||||
using mindspore::schema::PrimitiveType_Floor;
|
||||
using mindspore::schema::PrimitiveType_Log;
|
||||
using mindspore::schema::PrimitiveType_LogicalNot;
|
||||
using mindspore::schema::PrimitiveType_Neg;
|
||||
using mindspore::schema::PrimitiveType_Round;
|
||||
using mindspore::schema::PrimitiveType_Rsqrt;
|
||||
using mindspore::schema::PrimitiveType_Sin;
|
||||
using mindspore::schema::PrimitiveType_Sqrt;
|
||||
using mindspore::schema::PrimitiveType_Square;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int ArithmeticSelfOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
||||
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
||||
size_t im_dst_x, im_dst_y;
|
||||
if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
|
||||
im_dst_x = out_tensors_[0]->Width() * CO4;
|
||||
im_dst_y = out_tensors_[0]->Height() * out_tensors_[0]->Batch();
|
||||
} else {
|
||||
im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * CO4;
|
||||
im_dst_x = out_tensors_[0]->Width();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
img_size->clear();
|
||||
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
|
||||
*img_size = vec;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param) {
|
||||
switch (param->op_parameter_.type_) {
|
||||
case PrimitiveType_Abs:
|
||||
kernel_name[0] += "_ElementAbs";
|
||||
break;
|
||||
case PrimitiveType_Cos:
|
||||
kernel_name[0] += "_ElementCos";
|
||||
break;
|
||||
case PrimitiveType_Exp:
|
||||
kernel_name[0] += "_ElementExp";
|
||||
break;
|
||||
case PrimitiveType_Log:
|
||||
kernel_name[0] += "_ElementLog";
|
||||
break;
|
||||
case PrimitiveType_Square:
|
||||
kernel_name[0] += "_ElementSquare";
|
||||
break;
|
||||
case PrimitiveType_Sqrt:
|
||||
kernel_name[0] += "_ElementSqrt";
|
||||
break;
|
||||
case PrimitiveType_Rsqrt:
|
||||
kernel_name[0] += "_ElementRsqrt";
|
||||
break;
|
||||
case PrimitiveType_Sin:
|
||||
kernel_name[0] += "_ElementSin";
|
||||
break;
|
||||
case PrimitiveType_LogicalNot:
|
||||
kernel_name[0] += "_ElementLogicalNot";
|
||||
break;
|
||||
case PrimitiveType_Floor:
|
||||
kernel_name[0] += "_ElementFloor";
|
||||
break;
|
||||
case PrimitiveType_Ceil:
|
||||
kernel_name[0] += "_ElementCeil";
|
||||
break;
|
||||
case PrimitiveType_Round:
|
||||
kernel_name[0] += "_ElementRound";
|
||||
case PrimitiveType_Neg:
|
||||
kernel_name[0] += "_ElementNeg";
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int ArithmeticSelfOpenCLKernel::Init() {
|
||||
if (in_tensors_[0]->shape().size() != 4) {
|
||||
MS_LOG(ERROR) << " only support dim = 4 ";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_);
|
||||
|
||||
auto in_format = op_format_;
|
||||
if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) {
|
||||
MS_LOG(ERROR) << "input format(" << in_format << ") "
|
||||
<< "format not support!";
|
||||
return RET_ERROR;
|
||||
}
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
in_tensors_[0]->SetFormat(op_format_);
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
out_tensors_[0]->SetFormat(op_format_);
|
||||
|
||||
std::string kernel_name = "ArithmeticSelf";
|
||||
GetKernelName(&kernel_name, param);
|
||||
if (in_format == schema::Format_NC4HW4) {
|
||||
kernel_name += "_NC4HW4";
|
||||
} else if (in_format == schema::Format_NHWC4) {
|
||||
kernel_name += "_NHWC4";
|
||||
}
|
||||
MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
|
||||
std::set<std::string> build_options;
|
||||
std::string source = arithmeticself_source;
|
||||
std::string program_name = "ArithmeticSelf";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; }
|
||||
|
||||
int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) {
|
||||
if (number % 8 == 0 && max_divider >= 8) {
|
||||
return number / 8;
|
||||
}
|
||||
if (number % 4 == 0 && 4 <= max_divider) {
|
||||
return number / 4;
|
||||
}
|
||||
if (number % 2 == 0 && 2 <= max_divider) {
|
||||
return number / 2;
|
||||
}
|
||||
|
||||
for (int i = max_divider; i != 0; i--) {
|
||||
if (number % i == 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
|
||||
const int max_divider = 8;
|
||||
const int max_x = 4, max_y = 8;
|
||||
int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x);
|
||||
int yz = max_size / x;
|
||||
int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
|
||||
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
|
||||
|
||||
local->clear();
|
||||
local->push_back(x);
|
||||
local->push_back(y);
|
||||
local->push_back(z);
|
||||
}
|
||||
|
||||
int ArithmeticSelfOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
|
||||
|
||||
uint32_t OH = output_shape[0] * output_shape[1]; // N*H
|
||||
uint32_t OW = output_shape[2];
|
||||
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
|
||||
|
||||
const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1}; // init local
|
||||
std::vector<size_t> global = {OH, OW, OC};
|
||||
ArithmeticSelfGetWorkGroup(global, &local, max_global[0]);
|
||||
|
||||
int arg_cn = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *OpenCLArithmeticSelfKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs,
|
||||
OpParameter *opParameter, const lite::Context *ctx,
|
||||
const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
auto *kernel = new (std::nothrow) ArithmeticSelfOpenCLKernel(opParameter, inputs, outputs);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << " new ArithmeticSelfOpenCLKernel failed ";
|
||||
return nullptr;
|
||||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << " Init kernel failed, name: ArithmeticSelf ";
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLArithmeticSelfKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLArithmeticSelfKernelCreator)
|
||||
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_SELF_PARAMETER_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_SELF_PARAMETER_H_
|
||||
|
||||
#include <vector>
|
||||
#include<string>
|
||||
#include "ir/anf.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "nnacl/arithmetic_self_parameter.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
class ArithmeticSelfOpenCLKernel : public OpenCLKernel {
|
||||
public:
|
||||
explicit ArithmeticSelfOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs)
|
||||
: OpenCLKernel(parameter, inputs, outputs) {}
|
||||
|
||||
~ArithmeticSelfOpenCLKernel() override{};
|
||||
|
||||
int Init() override;
|
||||
|
||||
int ReSize() override;
|
||||
|
||||
int Run() override;
|
||||
|
||||
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
|
||||
|
||||
void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param);
|
||||
|
||||
private:
|
||||
cl::Kernel kernel_;
|
||||
};
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
#endif
|
|
@ -114,7 +114,11 @@ int TransposeOpenCLKernel::Run() {
|
|||
cl_int2 C = {c, c4};
|
||||
int arg_idx = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
||||
if (out_mem_type_ == OpenCLMemType::BUF) {
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data(), lite::opencl::MemType::BUF);
|
||||
} else {
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
||||
}
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, HW);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, C);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_idx++, w);
|
||||
|
|
|
@ -150,6 +150,7 @@ if (SUPPORT_GPU)
|
|||
${LITE_DIR}/src/runtime/kernel/opencl/kernel/matmul.cc
|
||||
${LITE_DIR}/src/runtime/kernel/opencl/kernel/softmax.cc
|
||||
${LITE_DIR}/src/runtime/kernel/opencl/kernel/concat.cc
|
||||
${LITE_DIR}/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
|
||||
${LITE_DIR}/src/runtime/kernel/opencl/kernel/batchnorm.cc
|
||||
${LITE_DIR}/src/runtime/kernel/opencl/kernel/slice.cc
|
||||
${LITE_DIR}/src/runtime/kernel/opencl/kernel/activation.cc
|
||||
|
@ -334,6 +335,7 @@ if (SUPPORT_GPU)
|
|||
${TEST_DIR}/ut/src/runtime/kernel/opencl/concat_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/slice_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/softmax_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc
|
||||
|
|
|
@ -0,0 +1,133 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include "utils/log_adapter.h"
|
||||
#include "common/common_test.h"
|
||||
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
|
||||
#include "mindspore/lite/src/common/file_utils.h"
|
||||
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
|
||||
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h"
|
||||
|
||||
namespace mindspore {
|
||||
class TestArithmeticSelfOpenCLfp16 : public mindspore::CommonTest {
|
||||
public:
|
||||
TestArithmeticSelfOpenCLfp16() {}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void CompareOutputData1(T *input_data1, T *output_data, T *correct_data, int size, float err_bound) {
|
||||
for (size_t i = 0; i < 100; i++) {
|
||||
T abs = fabs(output_data[i] - correct_data[i]);
|
||||
ASSERT_LE(abs, err_bound);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) {
|
||||
MS_LOG(INFO) << " begin test ";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->SetFp16Enable(true);
|
||||
ocl_runtime->Init();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
|
||||
// get the input from .bin
|
||||
size_t input1_size, output_size;
|
||||
std::string input1Ppath = "./test_data/in_arithmetic_selffp16.bin";
|
||||
std::string correctOutputPath = "./test_data/out_arithmetic_selffp16.bin";
|
||||
auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
|
||||
auto correctOutput =
|
||||
reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
|
||||
|
||||
MS_LOG(INFO) << " init tensors ";
|
||||
|
||||
std::vector<int> shape = {1, 19, 19, 96};
|
||||
auto data_type = kNumberTypeFloat16;
|
||||
auto tensor_type = schema::NodeType_ValueNode;
|
||||
auto *input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
|
||||
auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
|
||||
if (input_tensor == nullptr || output_tensor == nullptr) {
|
||||
MS_LOG(INFO) << " new input_tensor or output_tensor failed ";
|
||||
return;
|
||||
}
|
||||
std::vector<lite::tensor::Tensor *> inputs{input_tensor};
|
||||
std::vector<lite::tensor::Tensor *> outputs{output_tensor};
|
||||
|
||||
MS_LOG(INFO) << " initialize param ";
|
||||
auto param = new (std::nothrow) ArithmeticSelfParameter();
|
||||
if (param == nullptr) {
|
||||
MS_LOG(INFO) << " new ConcatParameter failed ";
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
for (auto tensor : outputs) {
|
||||
delete tensor;
|
||||
}
|
||||
return;
|
||||
}
|
||||
param->op_parameter_.type_ = schema::PrimitiveType_Neg;
|
||||
auto *arithmeticself_kernel =
|
||||
new (std::nothrow) kernel::ArithmeticSelfOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
|
||||
if (arithmeticself_kernel == nullptr) {
|
||||
MS_LOG(INFO) << " new kernel::ArithmeticSelfOpenCLKernel failed ";
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
for (auto tensor : outputs) {
|
||||
delete tensor;
|
||||
}
|
||||
delete param;
|
||||
return;
|
||||
}
|
||||
arithmeticself_kernel->SetFormatType(schema::Format_NC4HW4);
|
||||
arithmeticself_kernel->Init();
|
||||
// to do allocate memory for inputs and outputs
|
||||
for (auto &input_tensor : inputs) {
|
||||
input_tensor->MallocData(allocator);
|
||||
}
|
||||
MS_LOG(INFO) << " initialize sub_graph ";
|
||||
std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
|
||||
auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
|
||||
if (sub_graph == nullptr) {
|
||||
MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
for (auto tensor : outputs) {
|
||||
delete tensor;
|
||||
}
|
||||
delete param;
|
||||
delete arithmeticself_kernel;
|
||||
return;
|
||||
}
|
||||
sub_graph->Init();
|
||||
MS_LOG(INFO) << " initialize input data ";
|
||||
memcpy(inputs[0]->Data(), input_data1, input1_size);
|
||||
|
||||
std::cout << "==================output data================" << std::endl;
|
||||
sub_graph->Run();
|
||||
auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->Data());
|
||||
CompareOutputData1(input_data1, output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
for (auto tensor : outputs) {
|
||||
delete tensor;
|
||||
}
|
||||
delete param;
|
||||
delete arithmeticself_kernel;
|
||||
delete sub_graph;
|
||||
}
|
||||
} // namespace mindspore
|
Loading…
Reference in New Issue