change opencl code path
This commit is contained in:
parent
98565d8b54
commit
605c2b0b16
30
build.sh
30
build.sh
|
@ -460,24 +460,20 @@ build_gtest() {
|
|||
|
||||
gene_clhpp() {
|
||||
CL_SRC_DIR="${BASEPATH}/mindspore/lite/src/runtime/kernel/opencl/cl"
|
||||
for sub_dir in "${CL_SRC_DIR}"/*
|
||||
if [ ! -d ${CL_SRC_DIR} ]; then
|
||||
return
|
||||
fi
|
||||
cd ${CL_SRC_DIR}/
|
||||
rm -rf *.inc
|
||||
echo "$(cd "$(dirname $0)"; pwd)"
|
||||
for file_path in "${CL_SRC_DIR}"/*
|
||||
do
|
||||
data_type="$(basename ${sub_dir})"
|
||||
if [ ! -d ${CL_SRC_DIR}/${data_type} ]; then
|
||||
continue
|
||||
fi
|
||||
cd ${CL_SRC_DIR}/${data_type}
|
||||
rm -rf *.inc
|
||||
echo "$(cd "$(dirname $0)"; pwd)"
|
||||
for file_path in "${CL_SRC_DIR}/${data_type}"/*
|
||||
do
|
||||
file="$(basename ${file_path})"
|
||||
inc_file=`echo ${CL_SRC_DIR}/${data_type}/${file} | sed 's/$/.inc/'`
|
||||
sed 's/^/\"/;s/$/ \\n\" \\/' ${CL_SRC_DIR}/${data_type}/${file} > ${inc_file}
|
||||
kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
|
||||
sed -i "1i\static const char *${kernel_name}_source_${data_type} =\"\\n\" \\" ${inc_file}
|
||||
sed -i '$a\;' ${inc_file}
|
||||
done
|
||||
file="$(basename ${file_path})"
|
||||
inc_file=`echo ${CL_SRC_DIR}/${file} | sed 's/$/.inc/'`
|
||||
sed 's/^/\"/;s/$/ \\n\" \\/' ${CL_SRC_DIR}/${file} > ${inc_file}
|
||||
kernel_name=`echo ${file} | sed s'/.\{3\}$//'`
|
||||
sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" ${inc_file}
|
||||
sed -i '$a\;' ${inc_file}
|
||||
done
|
||||
}
|
||||
|
||||
|
|
|
@ -2,10 +2,7 @@
|
|||
|
||||
#define SLICES 4
|
||||
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
|
||||
#define FLT4 float4
|
||||
#define MIN(X, Y) (X < Y ? X : Y)
|
||||
#define READ_FLT4 read_imagef
|
||||
#define WRITE_FLT4 write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
|
@ -14,13 +11,13 @@ __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t out
|
|||
int Y = get_global_id(0); // height id
|
||||
int X = get_global_id(1); // weight id
|
||||
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
|
||||
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 tmp;
|
||||
tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha;
|
||||
tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha;
|
||||
tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha;
|
||||
tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * alpha;
|
||||
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -29,13 +26,13 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c
|
|||
int Y = get_global_id(0); // height id
|
||||
int X = get_global_id(1); // weight id
|
||||
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
|
||||
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 tmp;
|
||||
tmp.x = in_c4.x >= 0 ? in_c4.x : 0;
|
||||
tmp.y = in_c4.y >= 0 ? in_c4.y : 0;
|
||||
tmp.z = in_c4.z >= 0 ? in_c4.z : 0;
|
||||
tmp.w = in_c4.w >= 0 ? in_c4.w : 0;
|
||||
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -44,13 +41,13 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output,
|
|||
int Y = get_global_id(0); // height id
|
||||
int X = get_global_id(1); // weight id
|
||||
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
|
||||
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 tmp;
|
||||
tmp.x = in_c4.x >= 0 ? MIN(in_c4.x, 6) : 0;
|
||||
tmp.y = in_c4.y >= 0 ? MIN(in_c4.y, 6) : 0;
|
||||
tmp.z = in_c4.z >= 0 ? MIN(in_c4.z, 6) : 0;
|
||||
tmp.w = in_c4.w >= 0 ? MIN(in_c4.w, 6) : 0;
|
||||
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -59,12 +56,12 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output
|
|||
int Y = get_global_id(0); // height id
|
||||
int X = get_global_id(1); // weight id
|
||||
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
|
||||
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 tmp;
|
||||
tmp.x = 1 / (1 + exp(-in_c4.x));
|
||||
tmp.y = 1 / (1 + exp(-in_c4.y));
|
||||
tmp.z = 1 / (1 + exp(-in_c4.z));
|
||||
tmp.w = 1 / (1 + exp(-in_c4.w));
|
||||
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
}
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
#define divide_no_check(a, b) (a / b)
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), a + b);
|
||||
}
|
||||
|
||||
__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), a - b);
|
||||
}
|
||||
|
||||
__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), a * b);
|
||||
}
|
||||
|
||||
__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
|
||||
__write_only image2d_t output, const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
|
||||
WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias);
|
||||
}
|
||||
|
||||
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] + input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] - input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = weight * input_a[idx] + bias;
|
||||
}
|
|
@ -1,4 +1,3 @@
|
|||
#define FLT4 float4
|
||||
#define INT4 int4
|
||||
#define INT2 int2
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
|
@ -2,9 +2,6 @@
|
|||
|
||||
#define SLICES 4
|
||||
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
|
||||
#define FLT4 float4
|
||||
#define READ_FLT4 read_imagef
|
||||
#define WRITE_FLT4 write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
|
@ -14,13 +11,13 @@ __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t out
|
|||
int Y = get_global_id(0); // height id
|
||||
int X = get_global_id(1); // weight id
|
||||
for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
|
||||
FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y)); // NHWC4: H WC
|
||||
FLT4 tmp;
|
||||
int index = num * 4;
|
||||
tmp.x = in_c4.x * alpha[index];
|
||||
tmp.y = in_c4.y * alpha[index + 1];
|
||||
tmp.z = in_c4.z * alpha[index + 2];
|
||||
tmp.w = in_c4.w * alpha[index + 3];
|
||||
WRITE_FLT4(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp); // NHWC4: H WC
|
||||
}
|
||||
}
|
|
@ -1,5 +1,4 @@
|
|||
// #pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define FLT4 float4
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
|
|
@ -1,8 +1,3 @@
|
|||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define FLT16 float16
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
|
||||
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
|
|
@ -1,12 +1,3 @@
|
|||
#ifdef ENABLE_FP16
|
||||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define TO_FLT4 convert_half4
|
||||
#else
|
||||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define TO_FLT4 convert_float4
|
||||
#endif
|
||||
__constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
|
||||
float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
|
|
@ -1,61 +0,0 @@
|
|||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define FLT16 half16
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
|
||||
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
|
||||
int4 src_size, int4 dst_size) {
|
||||
int h = get_global_id(0);
|
||||
int kh = h % 2;
|
||||
int src_h = h / 2;
|
||||
src_h = src_h * 2;
|
||||
int w = get_global_id(1);
|
||||
int kw = w % 2;
|
||||
int src_w = w / 2;
|
||||
src_w = src_w * 2;
|
||||
int co = get_global_id(2);
|
||||
if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
|
||||
FLT4 r0 = (FLT4)(0.f);
|
||||
FLT4 r1 = (FLT4)(0.f);
|
||||
FLT4 r2 = (FLT4)(0.f);
|
||||
FLT4 r3 = (FLT4)(0.f);
|
||||
int base_w = (co * 4 + kh + kw * 2) * src_size.z;
|
||||
for (int ci = 0; ci < src_size.z; ++ci) {
|
||||
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
|
||||
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
|
||||
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
|
||||
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
|
||||
FLT16 weight_cache = weight[base_w++];
|
||||
r0 += x0.x * weight_cache.s0123;
|
||||
r0 += x0.y * weight_cache.s4567;
|
||||
r0 += x0.z * weight_cache.s89ab;
|
||||
r0 += x0.w * weight_cache.scdef;
|
||||
|
||||
r1 += x1.x * weight_cache.s0123;
|
||||
r1 += x1.y * weight_cache.s4567;
|
||||
r1 += x1.z * weight_cache.s89ab;
|
||||
r1 += x1.w * weight_cache.scdef;
|
||||
|
||||
r2 += x2.x * weight_cache.s0123;
|
||||
r2 += x2.y * weight_cache.s4567;
|
||||
r2 += x2.z * weight_cache.s89ab;
|
||||
r2 += x2.w * weight_cache.scdef;
|
||||
|
||||
r3 += x3.x * weight_cache.s0123;
|
||||
r3 += x3.y * weight_cache.s4567;
|
||||
r3 += x3.z * weight_cache.s89ab;
|
||||
r3 += x3.w * weight_cache.scdef;
|
||||
}
|
||||
FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
|
||||
r0 += bias_val;
|
||||
r1 += bias_val;
|
||||
r2 += bias_val;
|
||||
r3 += bias_val;
|
||||
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
|
||||
WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
|
||||
}
|
|
@ -1,76 +0,0 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define ACCUM_FLT4 half4
|
||||
#define FLT half
|
||||
#define FLT2 half2
|
||||
#define FLT3 half3
|
||||
#define FLT4 half4
|
||||
#define TO_FLT4 convert_half4
|
||||
#define TO_ACCUM_TYPE convert_half4
|
||||
#define TO_ACCUM_FLT convert_half
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
|
||||
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
|
||||
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
int x_offseted = X * stride.x + padding.x;
|
||||
int y_offseted = Y * stride.y + padding.y;
|
||||
int fx_c = Z * kernel_size.x * kernel_size.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offseted + ky * dilation.y;
|
||||
bool outside_y = y_c < 0 || y_c >= src_size.y;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = x_offseted + kx * dilation.x;
|
||||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 f = filters[fx_c];
|
||||
FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
|
||||
r += TO_ACCUM_TYPE(src_final * f);
|
||||
}
|
||||
fx_c++;
|
||||
}
|
||||
}
|
||||
FLT4 bias_val = biases[Z];
|
||||
FLT4 res0 = TO_FLT4(r) + bias_val;
|
||||
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
|
||||
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
|
||||
}
|
||||
|
||||
__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
|
||||
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
|
||||
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
int x_offseted = X * stride.x + padding.x;
|
||||
int y_offseted = Y * stride.y + padding.y;
|
||||
int fx_c = Z * kernel_size.x * kernel_size.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offseted + ky * dilation.y;
|
||||
bool outside_y = y_c < 0 || y_c >= src_size.y;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = x_offseted + kx * dilation.x;
|
||||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 f = filters[fx_c];
|
||||
FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
|
||||
r += TO_ACCUM_TYPE(src_final * f);
|
||||
}
|
||||
fx_c++;
|
||||
}
|
||||
}
|
||||
FLT4 bias_val = biases[Z];
|
||||
FLT4 res0 = TO_FLT4(r) + bias_val;
|
||||
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
|
||||
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
|
||||
}
|
|
@ -1,45 +0,0 @@
|
|||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= HW.y || Y >= C.y) {
|
||||
return;
|
||||
}
|
||||
FLT4 result[4];
|
||||
result[0] = (FLT4)(0.0f);
|
||||
result[1] = (FLT4)(0.0f);
|
||||
result[2] = (FLT4)(0.0f);
|
||||
result[3] = (FLT4)(0.0f);
|
||||
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
|
||||
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
|
||||
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
|
||||
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
|
||||
result[0].x = x0.x;
|
||||
result[0].y = x1.x;
|
||||
result[0].z = x2.x;
|
||||
result[0].w = x3.x;
|
||||
|
||||
result[1].x = x0.y;
|
||||
result[1].y = x1.y;
|
||||
result[1].z = x2.y;
|
||||
result[1].w = x3.y;
|
||||
|
||||
result[2].x = x0.z;
|
||||
result[2].y = x1.z;
|
||||
result[2].z = x2.z;
|
||||
result[2].w = x3.z;
|
||||
|
||||
result[3].x = x0.w;
|
||||
result[3].y = x1.w;
|
||||
result[3].z = x2.w;
|
||||
result[3].w = x3.w;
|
||||
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
|
||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
|
||||
}
|
|
@ -1,34 +0,0 @@
|
|||
__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] + input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] - input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] / input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = weight * input_a[idx] + bias;
|
||||
}
|
|
@ -1,66 +0,0 @@
|
|||
#define divide_no_check(a, b) (a/b)
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), a + b);
|
||||
}
|
||||
|
||||
__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), a - b);
|
||||
}
|
||||
|
||||
__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), a * b);
|
||||
}
|
||||
|
||||
__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), divide_no_check(a, b));
|
||||
}
|
||||
|
||||
__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
|
||||
const int2 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
if (X >= output_shape.x || Y >= output_shape.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
|
||||
write_imagef(output, (int2)(X, Y), weight * a + bias);
|
||||
}
|
|
@ -1,32 +0,0 @@
|
|||
#define FLT4 float4
|
||||
#define FLT16 float16
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
|
||||
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
|
||||
int2 gid = (int2)(get_global_id(0), get_global_id(1));
|
||||
int2 lid = (int2)(get_local_id(0), get_local_id(1));
|
||||
FLT4 result = (FLT4)(0.0f);
|
||||
bool inside = gid.x < offset_co.y;
|
||||
for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
|
||||
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0));
|
||||
FLT16 w = weight[gid.x + i * offset_co.y];
|
||||
result.x += dot(v, w.s0123);
|
||||
result.y += dot(v, w.s4567);
|
||||
result.z += dot(v, w.s89ab);
|
||||
result.w += dot(v, w.scdef);
|
||||
}
|
||||
__local FLT4 temp[64][4];
|
||||
temp[lid.x][lid.y] = result;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid.y == 0 && inside) {
|
||||
result += temp[lid.x][1];
|
||||
result += temp[lid.x][2];
|
||||
result += temp[lid.x][3];
|
||||
if (has_bias != 0) {
|
||||
result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
|
||||
}
|
||||
WRITE_IMAGE(output, (int2)(gid.x, 0), result);
|
||||
}
|
||||
}
|
|
@ -1,14 +0,0 @@
|
|||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= size.x || Y >= size.y || Z >= size.z) {
|
||||
return;
|
||||
}
|
||||
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
|
||||
}
|
|
@ -1,7 +1,3 @@
|
|||
#define FLT4 half4
|
||||
#define FLT16 half16
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
|
||||
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
|
|
@ -1,7 +1,3 @@
|
|||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define READ_IMAGE read_imageh
|
||||
#define WRITE_IMAGE write_imageh
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
|
||||
int X = get_global_id(0);
|
|
@ -1,5 +1,5 @@
|
|||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
// what is mask and args.slices_x32
|
||||
__kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
|
||||
const int slices, const int slices_x32) {
|
||||
|
@ -54,11 +54,11 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
|
|||
int tid = get_local_id(0);
|
||||
float sum = 0.0f;
|
||||
for (size_t i = tid; i < slices - 1; i += 32) {
|
||||
float4 src = read_imagef(input, smp_none, (int2)(i, 0));
|
||||
float4 src = read_imagef(input, smp_zero, (int2)(i, 0));
|
||||
sum += dot((float4)(1.0f), exp(src));
|
||||
}
|
||||
if ((slices - 1) % 32 == tid) {
|
||||
float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0));
|
||||
float4 src = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
|
||||
sum += dot(mask, exp(src));
|
||||
}
|
||||
|
||||
|
@ -80,12 +80,12 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
|
|||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
sum = tmpx1[0];
|
||||
for (size_t i = tid; i < slices - 1; i += 32) {
|
||||
float4 result = read_imagef(input, smp_none, (int2)(i, 0));
|
||||
float4 result = read_imagef(input, smp_zero, (int2)(i, 0));
|
||||
result = exp(result) * sum;
|
||||
output[i] = result;
|
||||
}
|
||||
if ((slices - 1) % 32 == tid) {
|
||||
float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0));
|
||||
float4 result = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
|
||||
result = exp(result) * sum;
|
||||
__global float4 *remain_ptr4 = output;
|
||||
remain_ptr4 += slices - 1;
|
|
@ -1,22 +1,3 @@
|
|||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
// enum Format {
|
||||
// Format_NCHW = 0,
|
||||
// Format_NHWC = 1,
|
||||
// Format_NHWC4 = 2,
|
||||
// Format_HWKC = 3,
|
||||
// Format_HWCK = 4,
|
||||
// Format_KCHW = 5,
|
||||
// Format_CKHW = 6,
|
||||
// Format_KHWC = 7,
|
||||
// Format_CHWK = 8,
|
||||
// Format_NC4HW4 = 100,
|
||||
// Format_NUM_OF_FORMAT = 101,
|
||||
// Format_MIN = Format_NCHW,
|
||||
// Format_MAX = Format_NUM_OF_FORMAT
|
||||
//};
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
|
||||
int4 shape) {
|
|
@ -1,7 +1,3 @@
|
|||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define READ_IMAGE read_imagef
|
||||
#define WRITE_IMAGE write_imagef
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
||||
int X = get_global_id(0);
|
|
@ -24,7 +24,7 @@
|
|||
#include "src/runtime/runtime_api.h"
|
||||
#include "include/errorcode.h"
|
||||
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
|
@ -46,7 +46,7 @@ int ActivationOpenClKernel::Init() {
|
|||
}
|
||||
std::string program_name = "";
|
||||
std::string kernel_name = "";
|
||||
std::string source = activation_source_fp32;
|
||||
std::string source = activation_source;
|
||||
if (type_ == ActivationType_RELU) {
|
||||
program_name = "RELU";
|
||||
kernel_name = "Relu";
|
||||
|
|
|
@ -22,8 +22,7 @@
|
|||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/arithmetic.cl.inc"
|
||||
#endif
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -109,9 +108,14 @@ int ArithmeticOpenCLKernel::Init() {
|
|||
error_code = RET_ERROR;
|
||||
}
|
||||
#else
|
||||
if (out_mem_type_ == OpenCLMemType::IMG) {
|
||||
kernel_name += "_IMG";
|
||||
} else {
|
||||
kernel_name += "_BUF";
|
||||
}
|
||||
std::string program_name = "Arithmetic";
|
||||
std::set<std::string> build_options;
|
||||
std::string source = arithmetic_image2d_source_fp32;
|
||||
std::string source = arithmetic_source;
|
||||
runtime_->LoadSource(program_name, source);
|
||||
error_code = runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/batchnorm.h"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/batchnorm.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
|
@ -50,7 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
|
|||
}
|
||||
int BatchNormOpenCLKernel::Init() {
|
||||
std::set<std::string> build_options;
|
||||
std::string source = batchnorm_source_fp32;
|
||||
std::string source = batchnorm_source;
|
||||
std::string program_name = "batch_normalization";
|
||||
std::string kernel_name = "batch_normalization";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
#include "include/errorcode.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/caffe_prelu.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/caffe_prelu.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/caffe_prelu.cl.inc"
|
||||
#include "src/runtime/kernel/arm/nnacl/caffeprelu.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -57,7 +57,7 @@ int CaffePReluOpenCLKernel::Init() {
|
|||
}
|
||||
CaffeWeight();
|
||||
std::set<std::string> build_options;
|
||||
std::string source = caffe_prelu_source_fp32;
|
||||
std::string source = caffe_prelu_source;
|
||||
std::string program_name = "CaffePRelu";
|
||||
std::string kernel_name = "CaffePRelu";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/concat.h"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/concat.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/concat.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
|
@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() {
|
|||
}
|
||||
if (in_tensors_.size() == 2) {
|
||||
std::set<std::string> build_options;
|
||||
std::string source = concat_source_fp32;
|
||||
std::string source = concat_source;
|
||||
std::string program_name = "Concat";
|
||||
std::string kernel_name = "Concat";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
@ -76,7 +76,7 @@ int ConcatOpenCLKernel::Init() {
|
|||
|
||||
if (in_tensors_.size() == 3) {
|
||||
std::set<std::string> build_options;
|
||||
std::string source = concat_source_fp32;
|
||||
std::string source = concat_source;
|
||||
std::string program_name = "Concat3input";
|
||||
std::string kernel_name = "Concat3input";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
|
|
@ -20,8 +20,7 @@
|
|||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl.inc"
|
||||
#endif
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -45,11 +44,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
|
|||
#ifdef PROGRAM_WITH_IL
|
||||
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
|
||||
#else
|
||||
#ifdef ENABLE_FP16
|
||||
std::string source = conv2d_transpose2x2_source_fp16;
|
||||
#else
|
||||
std::string source = conv2d_transpose2x2_source_fp32;
|
||||
#endif
|
||||
std::string source = conv2d_transpose2x2_source;
|
||||
std::set<std::string> build_options;
|
||||
std::string program_name = "conv2d_transpose2x2";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
|
|
|
@ -26,8 +26,7 @@
|
|||
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
|
||||
#include "src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/depthwise_conv2d.cl.inc"
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -68,11 +67,7 @@ int DepthwiseConv2dOpenCLKernel::Init() {
|
|||
#else
|
||||
std::string program_name = "DepthwiseConv2d";
|
||||
std::set<std::string> build_options;
|
||||
#ifdef ENABLE_FP16
|
||||
std::string source = depthwise_conv2d_source_fp16;
|
||||
#else
|
||||
std::string source = depthwise_conv2d_source_fp32;
|
||||
#endif
|
||||
std::string source = depthwise_conv2d_source;
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
|
|
|
@ -21,8 +21,7 @@
|
|||
#include "src/runtime/kernel/arm/nnacl/fp32/matmul.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/matmul.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/fp16/matmul.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/matmul.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
|
||||
#endif
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -40,11 +39,7 @@ int MatMulOpenCLKernel::Init() {
|
|||
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
#ifdef ENABLE_FP16
|
||||
std::string source = matmul_source_fp16;
|
||||
#else
|
||||
std::string source = matmul_source_fp32;
|
||||
#endif
|
||||
std::string source = matmul_source;
|
||||
std::string program_name = "MatMul";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
|
|
@ -24,8 +24,8 @@
|
|||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/image_format.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc"
|
||||
#endif
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -46,13 +46,13 @@ int PoolingOpenCLKernel::Init() {
|
|||
if (parameter_->max_pooling_) {
|
||||
kernel_name = "MaxPooling2d";
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
source = max_pool2d_source_fp32;
|
||||
source = max_pool2d_source;
|
||||
program_name = "MaxPooling2d";
|
||||
#endif
|
||||
} else if (parameter_->avg_pooling_) {
|
||||
kernel_name = "AvgPooling2d";
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
source = avg_pool2d_source_fp32;
|
||||
source = avg_pool2d_source;
|
||||
program_name = "AvgPooling2d";
|
||||
#endif
|
||||
} else {
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#include "include/errorcode.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/prelu.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/activation.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
|
||||
#include "src/runtime/kernel/arm/nnacl/prelu_parameter.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -40,7 +40,7 @@ int PReluOpenCLKernel::Init() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
std::set<std::string> build_options;
|
||||
std::string source = activation_source_fp32;
|
||||
std::string source = activation_source;
|
||||
std::string program_name = "PRelu";
|
||||
std::string kernel_name = "ReluScalar";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
|
|
@ -20,8 +20,7 @@
|
|||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/reshape.h"
|
||||
#include "src/runtime/kernel/opencl/cl/fp16/reshape.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/reshape.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/reshape.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
|
@ -39,11 +38,7 @@ int ReshapeOpenCLKernel::Init() {
|
|||
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
#ifdef ENABLE_FP16
|
||||
std::string source = reshape_source_fp16;
|
||||
#else
|
||||
std::string source = reshape_source_fp32;
|
||||
#endif
|
||||
std::string source = reshape_source;
|
||||
std::string program_name = "reshape";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
|
|
@ -22,8 +22,8 @@
|
|||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/softmax.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/softmax1x1.cl.inc"
|
||||
#endif
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -88,7 +88,7 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
int SoftmaxOpenCLKernel::Init() {
|
||||
std::string kernel_name = "SoftMax";
|
||||
std::string program_name = "SoftMax";
|
||||
std::string source = softmax_source_fp32;
|
||||
std::string source = softmax_source;
|
||||
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
// framework not set this param yet! just use default.
|
||||
if (parameter_->axis_ == -1) {
|
||||
|
@ -101,7 +101,7 @@ int SoftmaxOpenCLKernel::Init() {
|
|||
// support 2d tensor
|
||||
kernel_name += "1x1";
|
||||
program_name += "1x1";
|
||||
source = softmax1x1_source_fp32;
|
||||
source = softmax1x1_source;
|
||||
onexone_flag_ = true;
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Init `Softmax` kernel failed: Unsupported axis: " << parameter_->axis_;
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
#include "include/errorcode.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/to_format.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
|
@ -37,10 +37,9 @@ int ToFormatOpenCLKernel::Init() {
|
|||
auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
|
||||
out_mem_type_ = parameter->out_mem_type;
|
||||
std::string program_name = "to_format";
|
||||
std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"},
|
||||
{schema::Format_NHWC, "NHWC"},
|
||||
{schema::Format_NC4HW4, "NC4HW4"},
|
||||
{schema::Format_NHWC4, "NHWC4"}};
|
||||
std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"}, {schema::Format_NHWC, "NHWC"},
|
||||
{schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NC4, "NHWC4"},
|
||||
{schema::Format_NC, "NHWC"}, {schema::Format_NHWC4, "NHWC4"}};
|
||||
std::string kernel_name =
|
||||
"to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
|
||||
if (out_mem_type_ == OpenCLMemType::IMG) {
|
||||
|
@ -49,49 +48,54 @@ int ToFormatOpenCLKernel::Init() {
|
|||
kernel_name += "_BUF";
|
||||
}
|
||||
|
||||
this->set_name(kernel_name);
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
#ifdef ENABLE_FP16
|
||||
std::string source = to_format_source_fp16;
|
||||
#else
|
||||
std::string source = to_format_source_fp32;
|
||||
#endif
|
||||
std::string source = to_format_source;
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
InitNHWCShape();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ToFormatOpenCLKernel::InitNHWCShape() {
|
||||
std::vector<int> shapex = out_tensors_[0]->shape();
|
||||
size_t n, h, w, c;
|
||||
if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) {
|
||||
n = shapex[0];
|
||||
h = shapex[1];
|
||||
w = shapex[2];
|
||||
c = shapex[3];
|
||||
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 ||
|
||||
out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
|
||||
n = shapex[0];
|
||||
h = shapex[2];
|
||||
w = shapex[3];
|
||||
c = shapex[1];
|
||||
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4 || out_tensors_[0]->GetFormat() == schema::Format_NC) {
|
||||
n = shapex[0];
|
||||
h = 1;
|
||||
w = 1;
|
||||
c = shapex[1];
|
||||
} else {
|
||||
n = shapex[0];
|
||||
h = shapex[1];
|
||||
w = shapex[2];
|
||||
c = shapex[3];
|
||||
}
|
||||
nhwc_shape_ = {n, h, w, c};
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
|
||||
|
||||
int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
|
||||
std::vector<int> shapex = out_tensors_[0]->shape();
|
||||
if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) {
|
||||
int h = shapex[1];
|
||||
int w = shapex[2];
|
||||
int c = shapex[3];
|
||||
int c4 = UP_DIV(c, C4NUM);
|
||||
std::vector<size_t> vec = {(size_t)h, (size_t)w, (size_t)c4};
|
||||
*global_size = std::move(vec);
|
||||
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 ||
|
||||
out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
|
||||
int h = shapex[2];
|
||||
int w = shapex[3];
|
||||
int c = shapex[1];
|
||||
int c4 = UP_DIV(c, C4NUM);
|
||||
std::vector<size_t> vec = {(size_t)c4, (size_t)h, (size_t)w};
|
||||
*global_size = std::move(vec);
|
||||
} else if (out_tensors_[0]->GetFormat() == out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
|
||||
int h = shapex[2];
|
||||
int w = shapex[3];
|
||||
int c = shapex[1];
|
||||
int w4 = UP_DIV(w, C4NUM);
|
||||
std::vector<size_t> vec = {(size_t)w4, (size_t)h, (size_t)c};
|
||||
*global_size = std::move(vec);
|
||||
}
|
||||
std::vector<size_t> vec = {nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
|
||||
*global_size = std::move(vec);
|
||||
return RET_OK;
|
||||
}
|
||||
int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
|
||||
|
@ -114,6 +118,12 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
|
|||
int c = shapex[3];
|
||||
im_dst_x = w * UP_DIV(c, C4NUM);
|
||||
im_dst_y = h;
|
||||
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
|
||||
int h = 1;
|
||||
int w = 1;
|
||||
int c = shapex[1];
|
||||
im_dst_x = w * UP_DIV(c, C4NUM);
|
||||
im_dst_y = h;
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
|
||||
}
|
||||
|
@ -128,15 +138,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
|
|||
return RET_OK;
|
||||
}
|
||||
int ToFormatOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << "ToFormat"
|
||||
<< " Running!";
|
||||
MS_LOG(DEBUG) << this->name() << " Running!";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::vector<size_t> local = {};
|
||||
std::vector<size_t> global;
|
||||
GetGlobalSize(0, &global);
|
||||
auto shapex = in_tensors_[0]->shape();
|
||||
cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1,
|
||||
shapex.size() > 3 ? shapex[3] : 1};
|
||||
|
||||
cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]};
|
||||
cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
|
||||
ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
|
||||
|
|
|
@ -37,9 +37,11 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
|
|||
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
|
||||
int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
|
||||
int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
|
||||
int InitNHWCShape();
|
||||
|
||||
private:
|
||||
cl::Kernel kernel_;
|
||||
std::vector<size_t> nhwc_shape_;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -21,8 +21,7 @@
|
|||
#include "src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/transpose.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/fp16/transpose.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/transpose.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/transpose.cl.inc"
|
||||
#endif
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -45,11 +44,7 @@ int TransposeOpenCLKernel::Init() {
|
|||
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
|
||||
#else
|
||||
std::set<std::string> build_options;
|
||||
#ifdef ENABLE_FP16
|
||||
std::string source = transpose_source_fp16;
|
||||
#else
|
||||
std::string source = transpose_source_fp32;
|
||||
#endif
|
||||
std::string source = transpose_source;
|
||||
std::string program_name = "transpose";
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
|
|
|
@ -288,13 +288,13 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
|
|||
if (fp16_enable_) {
|
||||
// fp16 enable, kernel will use half and read_imageh and write_imageh.
|
||||
build_options_str =
|
||||
"-DFLOAT=half -DFLOAT4=half4 -DRI_F=read_imageh "
|
||||
"-DWI_F=write_imageh";
|
||||
"-DFLT=half -DFLT4=half4 -DFLT16=half16 "
|
||||
"-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4";
|
||||
} else {
|
||||
// fp16 not enable, kernel will use float and read_imagef and write_imagef.
|
||||
build_options_str =
|
||||
"-DFLOAT=float -DFLOAT4=float4 -DRI_F=read_imagef "
|
||||
"-DWI_F=write_imagef";
|
||||
"-DFLT=float -DFLT4=float4 -DFLT16=float16 "
|
||||
"-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4";
|
||||
}
|
||||
|
||||
build_options_str = std::accumulate(
|
||||
|
|
Loading…
Reference in New Issue