forked from mindspore-Ecosystem/mindspore
!5614 [MS][LITE][GPU]fix transpose softmax reshape bug
Merge pull request !5614 from chenzupeng/master-lite
This commit is contained in:
commit
8f106d685a
|
@ -1,11 +1,14 @@
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||||
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
|
__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) {
|
||||||
int X = get_global_id(0);
|
int X = get_global_id(0);
|
||||||
int Y = get_global_id(1);
|
int Y = get_global_id(1);
|
||||||
int Z = get_global_id(2);
|
int Z = get_global_id(2);
|
||||||
if (X >= size.x || Y >= size.y || Z >= size.z) {
|
if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
|
int out_index = X * size_out.y + Y;
|
||||||
|
int ih = out_index / size.y;
|
||||||
|
int iw = out_index % size.y;
|
||||||
|
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(iw * size.z + Z, ih)));
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,16 +1,21 @@
|
||||||
__kernel void SoftMax_BUF(__global float4 *input, __global float4 *output, const int4 input_shape) {
|
#ifdef cl_khr_fp16
|
||||||
int X = get_global_id(0);
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
int Y = get_global_id(1);
|
#endif
|
||||||
|
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||||
|
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||||
|
__kernel void SoftMax_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) {
|
||||||
|
int X = get_global_id(0); // H
|
||||||
|
int Y = get_global_id(1); // W
|
||||||
int H = input_shape.x;
|
int H = input_shape.x;
|
||||||
int W = input_shape.y;
|
int W = input_shape.y;
|
||||||
int C = input_shape.z;
|
int C = input_shape.z;
|
||||||
int S = input_shape.w;
|
int S = input_shape.w;
|
||||||
|
|
||||||
if (X >= W || Y >= H) return;
|
if (X >= H || Y >= W) return;
|
||||||
|
|
||||||
float sum = 0.0f;
|
FLT sum = 0.0f;
|
||||||
for (int d = 0; d < S; ++d) {
|
for (int d = 0; d < S; ++d) {
|
||||||
float4 t = input[(Y * W + X * H) * C + d];
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
|
||||||
sum += exp(t.x);
|
sum += exp(t.x);
|
||||||
if (d * 4 + 1 < C) sum += exp(t.y);
|
if (d * 4 + 1 < C) sum += exp(t.y);
|
||||||
if (d * 4 + 2 < C) sum += exp(t.z);
|
if (d * 4 + 2 < C) sum += exp(t.z);
|
||||||
|
@ -18,15 +23,17 @@ __kernel void SoftMax_BUF(__global float4 *input, __global float4 *output, const
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int d = 0; d < S; ++d) {
|
for (int d = 0; d < S; ++d) {
|
||||||
float4 t = input[(Y * W + X * H) * C + d];
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
|
||||||
t = exp(t) / sum;
|
t = exp(t) / sum;
|
||||||
float4 result = convert_float4(t);
|
__global FLT *output_flt = (__global FLT *)output;
|
||||||
output[(Y * W + X * H) * C + d] = result;
|
output_flt += (X * W + Y) * C + 4 * d;
|
||||||
|
output_flt[0] = t.x;
|
||||||
|
if (d * 4 + 1 < C) output_flt[1] += t.y;
|
||||||
|
if (d * 4 + 2 < C) output_flt[2] += t.z;
|
||||||
|
if (d * 4 + 3 < C) output_flt[3] += t.w;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
|
||||||
|
|
||||||
__kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
|
__kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
|
||||||
int X = get_global_id(0);
|
int X = get_global_id(0);
|
||||||
int Y = get_global_id(1);
|
int Y = get_global_id(1);
|
||||||
|
@ -34,7 +41,7 @@ __kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t ou
|
||||||
|
|
||||||
float sum = 0.0f;
|
float sum = 0.0f;
|
||||||
for (int d = 0; d < input_shape.w; ++d) {
|
for (int d = 0; d < input_shape.w; ++d) {
|
||||||
float4 t = read_imagef(input, smp_none, (int2)(Y * input_shape.w + d, X));
|
FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X));
|
||||||
sum += exp(t.x);
|
sum += exp(t.x);
|
||||||
if (d * 4 + 1 < input_shape.z) sum += exp(t.y);
|
if (d * 4 + 1 < input_shape.z) sum += exp(t.y);
|
||||||
if (d * 4 + 2 < input_shape.z) sum += exp(t.z);
|
if (d * 4 + 2 < input_shape.z) sum += exp(t.z);
|
||||||
|
@ -42,9 +49,112 @@ __kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t ou
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int d = 0; d < input_shape.w; ++d) {
|
for (int d = 0; d < input_shape.w; ++d) {
|
||||||
float4 t = read_imagef(input, smp_none, (int2)(Y * input_shape.w + d, X));
|
FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X));
|
||||||
t = exp(t) / sum;
|
t = exp(t) / sum;
|
||||||
float4 result = convert_float4(t);
|
FLT4 result = TO_FLT4(t);
|
||||||
write_imagef(output, (int2)(Y * input_shape.w + d, X), result);
|
WRITE_IMAGE(output, (int2)(Y * input_shape.w + d, X), result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const FLT4 mask,
|
||||||
|
const int slices, const int slices_x32) {
|
||||||
|
int tid = get_local_id(0);
|
||||||
|
int slices_count = 0;
|
||||||
|
int offset = 0;
|
||||||
|
FLT sum = 0.0f;
|
||||||
|
do {
|
||||||
|
int z = offset + tid;
|
||||||
|
if (z < slices) {
|
||||||
|
FLT4 mask_temp = z == slices - 1 ? mask : (FLT4)(1.0f);
|
||||||
|
FLT4 src = READ_IMAGE(input, smp_none, (int2)(0, 0));
|
||||||
|
sum += dot(mask_temp, exp(src));
|
||||||
|
offset += 32;
|
||||||
|
}
|
||||||
|
slices_count++;
|
||||||
|
} while (slices_count < slices_x32);
|
||||||
|
|
||||||
|
__local FLT4 tmp[8];
|
||||||
|
__local FLT *tmpx1 = (__local FLT *)tmp;
|
||||||
|
tmpx1[tid] = sum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid == 0) {
|
||||||
|
sum = dot((FLT4)(1.0f), tmp[0]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[1]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[2]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[3]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[4]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[5]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[6]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[7]);
|
||||||
|
tmpx1[0] = 1.0f / sum;
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
sum = tmpx1[0];
|
||||||
|
|
||||||
|
offset = 0;
|
||||||
|
slices_count = 0;
|
||||||
|
do {
|
||||||
|
int z = offset + tid;
|
||||||
|
if (z < slices) {
|
||||||
|
FLT4 res = TO_FLT4(exp(READ_IMAGE(input, smp_none, (int2)(0, 0))) * sum);
|
||||||
|
WRITE_IMAGE(output, (int2)(0, 0), res);
|
||||||
|
offset += 32;
|
||||||
|
}
|
||||||
|
slices_count++;
|
||||||
|
} while (slices_count < slices_x32);
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, const int slices,
|
||||||
|
const int slices_x32) {
|
||||||
|
int tid = get_local_id(0);
|
||||||
|
FLT sum = 0.0f;
|
||||||
|
for (size_t i = tid; i < slices - 1; i += 32) {
|
||||||
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0));
|
||||||
|
sum += dot((FLT4)(1.0f), exp(src));
|
||||||
|
}
|
||||||
|
if ((slices - 1) % 32 == tid) {
|
||||||
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
|
||||||
|
|
||||||
|
sum += dot(TO_FLT4(mask), exp(src));
|
||||||
|
}
|
||||||
|
|
||||||
|
__local FLT4 tmp[8];
|
||||||
|
__local FLT *tmpx1 = (__local FLT *)tmp;
|
||||||
|
tmpx1[tid] = sum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid == 0) {
|
||||||
|
sum = dot((FLT4)(1.0f), tmp[0]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[1]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[2]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[3]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[4]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[5]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[6]);
|
||||||
|
sum += dot((FLT4)(1.0f), tmp[7]);
|
||||||
|
tmpx1[0] = 1.0f / sum;
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
sum = tmpx1[0];
|
||||||
|
for (size_t i = tid; i < slices - 1; i += 32) {
|
||||||
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0));
|
||||||
|
result = exp(result) * sum;
|
||||||
|
output[i] = result;
|
||||||
|
}
|
||||||
|
if ((slices - 1) % 32 == tid) {
|
||||||
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
|
||||||
|
result = exp(result) * sum;
|
||||||
|
__global FLT4 *remain_ptr4 = output;
|
||||||
|
remain_ptr4 += slices - 1;
|
||||||
|
__global FLT *remain_ptr = (__global FLT *)remain_ptr4;
|
||||||
|
remain_ptr[0] = result.x;
|
||||||
|
if (mask.y > 0.f) {
|
||||||
|
remain_ptr[1] = result.y;
|
||||||
|
}
|
||||||
|
if (mask.z > 0.f) {
|
||||||
|
remain_ptr[2] = result.z;
|
||||||
|
}
|
||||||
|
if (mask.w > 0.f) {
|
||||||
|
remain_ptr[3] = result.w;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,104 +0,0 @@
|
||||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
|
||||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
|
||||||
// what is mask and args.slices_x32
|
|
||||||
__kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
|
|
||||||
const int slices, const int slices_x32) {
|
|
||||||
int tid = get_local_id(0);
|
|
||||||
int slices_count = 0;
|
|
||||||
int offset = 0;
|
|
||||||
float sum = 0.0f;
|
|
||||||
do {
|
|
||||||
int z = offset + tid;
|
|
||||||
if (z < slices) {
|
|
||||||
float4 mask_temp = z == slices - 1 ? mask : (float4)(1.0f);
|
|
||||||
float4 src = read_imagef(input, smp_none, (int2)(0, 0));
|
|
||||||
sum += dot(mask_temp, exp(src));
|
|
||||||
offset += 32;
|
|
||||||
}
|
|
||||||
slices_count++;
|
|
||||||
} while (slices_count < slices_x32);
|
|
||||||
|
|
||||||
__local float4 tmp[8];
|
|
||||||
__local float *tmpx1 = (__local float *)tmp;
|
|
||||||
tmpx1[tid] = sum;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
if (tid == 0) {
|
|
||||||
sum = dot((float4)(1.0f), tmp[0]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[1]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[2]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[3]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[4]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[5]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[6]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[7]);
|
|
||||||
tmpx1[0] = 1.0f / sum;
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
sum = tmpx1[0];
|
|
||||||
|
|
||||||
offset = 0;
|
|
||||||
slices_count = 0;
|
|
||||||
do {
|
|
||||||
int z = offset + tid;
|
|
||||||
if (z < slices) {
|
|
||||||
float4 res = convert_float4(exp(read_imagef(input, smp_none, (int2)(0, 0))) * sum);
|
|
||||||
write_imagef(output, (int2)(0, 0), res);
|
|
||||||
offset += 32;
|
|
||||||
}
|
|
||||||
slices_count++;
|
|
||||||
} while (slices_count < slices_x32);
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *output, const float4 mask, const int slices,
|
|
||||||
const int slices_x32) {
|
|
||||||
int tid = get_local_id(0);
|
|
||||||
float sum = 0.0f;
|
|
||||||
for (size_t i = tid; i < slices - 1; i += 32) {
|
|
||||||
float4 src = read_imagef(input, smp_zero, (int2)(i, 0));
|
|
||||||
sum += dot((float4)(1.0f), exp(src));
|
|
||||||
}
|
|
||||||
if ((slices - 1) % 32 == tid) {
|
|
||||||
float4 src = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
|
|
||||||
sum += dot(mask, exp(src));
|
|
||||||
}
|
|
||||||
|
|
||||||
__local float4 tmp[8];
|
|
||||||
__local float *tmpx1 = (__local float *)tmp;
|
|
||||||
tmpx1[tid] = sum;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
if (tid == 0) {
|
|
||||||
sum = dot((float4)(1.0f), tmp[0]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[1]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[2]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[3]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[4]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[5]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[6]);
|
|
||||||
sum += dot((float4)(1.0f), tmp[7]);
|
|
||||||
tmpx1[0] = 1.0f / sum;
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
sum = tmpx1[0];
|
|
||||||
for (size_t i = tid; i < slices - 1; i += 32) {
|
|
||||||
float4 result = read_imagef(input, smp_zero, (int2)(i, 0));
|
|
||||||
result = exp(result) * sum;
|
|
||||||
output[i] = result;
|
|
||||||
}
|
|
||||||
if ((slices - 1) % 32 == tid) {
|
|
||||||
float4 result = read_imagef(input, smp_zero, (int2)(slices - 1, 0));
|
|
||||||
result = exp(result) * sum;
|
|
||||||
__global float4 *remain_ptr4 = output;
|
|
||||||
remain_ptr4 += slices - 1;
|
|
||||||
__global float *remain_ptr = (__global float *)remain_ptr4;
|
|
||||||
remain_ptr[0] = result.x;
|
|
||||||
if (mask.y > 0.f) {
|
|
||||||
remain_ptr[1] = result.y;
|
|
||||||
}
|
|
||||||
if (mask.z > 0.f) {
|
|
||||||
remain_ptr[2] = result.z;
|
|
||||||
}
|
|
||||||
if (mask.w > 0.f) {
|
|
||||||
remain_ptr[3] = result.w;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,4 +1,6 @@
|
||||||
|
#ifdef cl_khr_fp16
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
#endif
|
||||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||||
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
||||||
int X = get_global_id(0);
|
int X = get_global_id(0);
|
||||||
|
@ -41,7 +43,7 @@ __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d
|
||||||
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C) {
|
__kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W) {
|
||||||
int X = get_global_id(0);
|
int X = get_global_id(0);
|
||||||
int Y = get_global_id(1);
|
int Y = get_global_id(1);
|
||||||
if (X >= HW.y || Y >= C.y) {
|
if (X >= HW.y || Y >= C.y) {
|
||||||
|
@ -52,10 +54,10 @@ __kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_dat
|
||||||
result[1] = (FLT4)(0.0f);
|
result[1] = (FLT4)(0.0f);
|
||||||
result[2] = (FLT4)(0.0f);
|
result[2] = (FLT4)(0.0f);
|
||||||
result[3] = (FLT4)(0.0f);
|
result[3] = (FLT4)(0.0f);
|
||||||
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
|
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
|
||||||
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
|
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
|
||||||
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
|
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
|
||||||
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
|
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
|
||||||
result[0].x = x0.x;
|
result[0].x = x0.x;
|
||||||
result[0].y = x1.x;
|
result[0].y = x1.x;
|
||||||
result[0].z = x2.x;
|
result[0].z = x2.x;
|
||||||
|
|
|
@ -65,7 +65,8 @@ int PoolingOpenCLKernel::Init() {
|
||||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||||
#else
|
#else
|
||||||
if (out_mem_type_ == OpenCLMemType::BUF) {
|
if (out_mem_type_ == OpenCLMemType::BUF) {
|
||||||
kernel_name += "_BUF";
|
MS_LOG(ERROR) << "buffer output not support yet.";
|
||||||
|
return RET_ERROR;
|
||||||
} else {
|
} else {
|
||||||
kernel_name += "_IMG";
|
kernel_name += "_IMG";
|
||||||
}
|
}
|
||||||
|
|
|
@ -68,10 +68,16 @@ int ReshapeOpenCLKernel::ReSize() { return RET_OK; }
|
||||||
|
|
||||||
int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
||||||
size_t im_dst_x, im_dst_y;
|
size_t im_dst_x, im_dst_y;
|
||||||
std::vector<int> shapex = in_tensors_[0]->shape();
|
std::vector<int> shapex = out_tensors_[0]->shape();
|
||||||
int h = shapex[1];
|
int h, w, c;
|
||||||
int w = shapex[2];
|
if (shapex.size() == 2) {
|
||||||
int c = shapex[3];
|
h = w = 1;
|
||||||
|
c = shapex[1];
|
||||||
|
} else {
|
||||||
|
h = shapex[1];
|
||||||
|
w = shapex[2];
|
||||||
|
c = shapex[3];
|
||||||
|
}
|
||||||
im_dst_x = w * UP_DIV(c, C4NUM);
|
im_dst_x = w * UP_DIV(c, C4NUM);
|
||||||
im_dst_y = h;
|
im_dst_y = h;
|
||||||
size_t img_dtype = CL_FLOAT;
|
size_t img_dtype = CL_FLOAT;
|
||||||
|
@ -91,13 +97,23 @@ int ReshapeOpenCLKernel::Run() {
|
||||||
int w = shapex[2];
|
int w = shapex[2];
|
||||||
int c = shapex[3];
|
int c = shapex[3];
|
||||||
int c4 = UP_DIV(c, C4NUM);
|
int c4 = UP_DIV(c, C4NUM);
|
||||||
|
int oh, ow;
|
||||||
|
if (out_tensors_[0]->shape().size() == 2) {
|
||||||
|
oh = ow = 1;
|
||||||
|
} else {
|
||||||
|
oh = out_tensors_[0]->shape()[1];
|
||||||
|
ow = out_tensors_[0]->shape()[2];
|
||||||
|
}
|
||||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||||
std::vector<size_t> local = {};
|
std::vector<size_t> local = {};
|
||||||
std::vector<size_t> global = {(size_t)h, (size_t)w, (size_t)c4};
|
std::vector<size_t> global = {(size_t)oh, (size_t)ow, (size_t)c4};
|
||||||
cl_int4 size = {h, w, c4, 1};
|
cl_int4 size = {h, w, c4, 1};
|
||||||
ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
|
cl_int4 size_out = {oh, ow, c4, 1};
|
||||||
ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
|
int arg_idx = 0;
|
||||||
ocl_runtime->SetKernelArg(kernel_, 2, size);
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
||||||
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
||||||
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, size);
|
||||||
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, size_out);
|
||||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,7 +23,6 @@
|
||||||
#include "src/runtime/kernel/opencl/utils.h"
|
#include "src/runtime/kernel/opencl/utils.h"
|
||||||
#ifndef PROGRAM_WITH_IL
|
#ifndef PROGRAM_WITH_IL
|
||||||
#include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
|
#include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
|
||||||
#include "src/runtime/kernel/opencl/cl/softmax1x1.cl.inc"
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||||
|
@ -42,8 +41,8 @@ std::vector<float> SoftmaxOpenCLKernel::GetMaskForLastChannel(int channels) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int SoftmaxOpenCLKernel::InitGlobalSize() {
|
int SoftmaxOpenCLKernel::InitGlobalSize() {
|
||||||
const size_t global_x = out_tensors_[0]->Height();
|
const size_t global_x = out_tensors_[0]->shape()[1];
|
||||||
const size_t global_y = out_tensors_[0]->Width();
|
const size_t global_y = out_tensors_[0]->shape()[2];
|
||||||
const size_t global_z = 1;
|
const size_t global_z = 1;
|
||||||
global_size_ = {global_x, global_y, global_z};
|
global_size_ = {global_x, global_y, global_z};
|
||||||
return lite::RET_OK;
|
return lite::RET_OK;
|
||||||
|
@ -74,11 +73,10 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
||||||
im_dst_x = out_tensors_[0]->Width() * CO4;
|
im_dst_x = out_tensors_[0]->Width() * CO4;
|
||||||
im_dst_y = out_tensors_[0]->Height();
|
im_dst_y = out_tensors_[0]->Height();
|
||||||
}
|
}
|
||||||
#ifdef ENABLE_FP16
|
|
||||||
size_t img_dtype = CL_HALF_FLOAT;
|
|
||||||
#else
|
|
||||||
size_t img_dtype = CL_FLOAT;
|
size_t img_dtype = CL_FLOAT;
|
||||||
#endif
|
if (enable_fp16_) {
|
||||||
|
img_dtype = CL_HALF_FLOAT;
|
||||||
|
}
|
||||||
img_size->clear();
|
img_size->clear();
|
||||||
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
|
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
|
||||||
*img_size = vec;
|
*img_size = vec;
|
||||||
|
@ -90,27 +88,28 @@ int SoftmaxOpenCLKernel::Init() {
|
||||||
std::string program_name = "SoftMax";
|
std::string program_name = "SoftMax";
|
||||||
std::string source = softmax_source;
|
std::string source = softmax_source;
|
||||||
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||||
|
enable_fp16_ = runtime_->GetFp16Enable();
|
||||||
// framework not set this param yet! just use default.
|
// framework not set this param yet! just use default.
|
||||||
if (parameter_->axis_ == -1) {
|
if (in_tensors_[0]->shape().size() == 4) {
|
||||||
parameter_->axis_ = 1;
|
|
||||||
}
|
|
||||||
if (in_tensors_[0]->shape().size() == 4 && parameter_->axis_ == 3) {
|
|
||||||
// support 4d tensor
|
// support 4d tensor
|
||||||
onexone_flag_ = false;
|
onexone_flag_ = false;
|
||||||
} else if (in_tensors_[0]->shape().size() == 2 && parameter_->axis_ == 1) {
|
} else if (in_tensors_[0]->shape().size() == 2) {
|
||||||
// support 2d tensor
|
// support 2d tensor
|
||||||
kernel_name += "1x1";
|
kernel_name += "1x1";
|
||||||
program_name += "1x1";
|
program_name += "1x1";
|
||||||
source = softmax1x1_source;
|
|
||||||
onexone_flag_ = true;
|
onexone_flag_ = true;
|
||||||
} else {
|
} else {
|
||||||
MS_LOG(EXCEPTION) << "Init `Softmax` kernel failed: Unsupported axis: " << parameter_->axis_;
|
MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported shape size: " << in_tensors_[0]->shape().size();
|
||||||
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
#ifdef PROGRAM_WITH_IL
|
#ifdef PROGRAM_WITH_IL
|
||||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||||
#else
|
#else
|
||||||
if (!is_image_out_) {
|
if (!is_image_out_) {
|
||||||
out_mem_type_ = OpenCLMemType::BUF;
|
out_mem_type_ = OpenCLMemType::BUF;
|
||||||
|
} else {
|
||||||
|
MS_LOG(ERROR) << "image2d output not support yet.";
|
||||||
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
if (out_mem_type_ == OpenCLMemType::BUF) {
|
if (out_mem_type_ == OpenCLMemType::BUF) {
|
||||||
kernel_name += "_BUF";
|
kernel_name += "_BUF";
|
||||||
|
@ -124,12 +123,23 @@ int SoftmaxOpenCLKernel::Init() {
|
||||||
runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||||
#endif
|
#endif
|
||||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||||
in_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
|
||||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||||
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
if (in_tensors_[0]->shape().size() == 2) {
|
||||||
if (!is_image_out_) {
|
in_tensors_[0]->SetFormat(schema::Format_NC4);
|
||||||
out_ori_format_ = schema::Format_NC;
|
} else {
|
||||||
out_tensors_[0]->SetFormat(schema::Format_NC);
|
in_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_image_out_) {
|
||||||
|
if (out_tensors_[0]->shape().size() == 2) {
|
||||||
|
out_ori_format_ = schema::Format_NC;
|
||||||
|
out_tensors_[0]->SetFormat(schema::Format_NC4);
|
||||||
|
} else {
|
||||||
|
out_ori_format_ = schema::Format_NHWC;
|
||||||
|
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
out_tensors_[0]->SetFormat(out_ori_format_);
|
||||||
}
|
}
|
||||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||||
return lite::RET_OK;
|
return lite::RET_OK;
|
||||||
|
@ -147,17 +157,25 @@ int SoftmaxOpenCLKernel::Run() {
|
||||||
cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]};
|
cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]};
|
||||||
|
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
if (is_image_out_) {
|
||||||
|
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
||||||
|
} else {
|
||||||
|
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data(), lite::opencl::MemType::BUF);
|
||||||
|
}
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, mask);
|
runtime_->SetKernelArg(kernel_, arg_idx++, mask);
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, slices);
|
runtime_->SetKernelArg(kernel_, arg_idx++, slices);
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx, slices_x32);
|
runtime_->SetKernelArg(kernel_, arg_idx, slices_x32);
|
||||||
SetWorkGroupSize1x1();
|
SetWorkGroupSize1x1();
|
||||||
} else {
|
} else {
|
||||||
int slices = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||||
cl_int4 input_shape = {in_tensors_[0]->Height(), in_tensors_[0]->Width(), in_tensors_[0]->Channel(), slices};
|
cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};
|
||||||
|
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
if (is_image_out_) {
|
||||||
|
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
||||||
|
} else {
|
||||||
|
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data(), lite::opencl::MemType::BUF);
|
||||||
|
}
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
|
runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
|
||||||
SetWorkGroupSize();
|
SetWorkGroupSize();
|
||||||
}
|
}
|
||||||
|
@ -193,4 +211,5 @@ kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector<lite::tensor::T
|
||||||
}
|
}
|
||||||
|
|
||||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator)
|
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator)
|
||||||
|
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator)
|
||||||
} // namespace mindspore::kernel
|
} // namespace mindspore::kernel
|
||||||
|
|
|
@ -52,6 +52,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
|
||||||
std::vector<size_t> local_size_;
|
std::vector<size_t> local_size_;
|
||||||
std::vector<size_t> global_size_;
|
std::vector<size_t> global_size_;
|
||||||
bool is_image_out_{false};
|
bool is_image_out_{false};
|
||||||
|
bool enable_fp16_{false};
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace mindspore::kernel
|
} // namespace mindspore::kernel
|
||||||
|
|
|
@ -119,11 +119,9 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
|
||||||
im_dst_x = w * UP_DIV(c, C4NUM);
|
im_dst_x = w * UP_DIV(c, C4NUM);
|
||||||
im_dst_y = h;
|
im_dst_y = h;
|
||||||
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
|
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
|
||||||
const int h = 1;
|
|
||||||
const int w = 1;
|
|
||||||
int c = shapex[1];
|
int c = shapex[1];
|
||||||
im_dst_x = w * UP_DIV(c, C4NUM);
|
im_dst_x = UP_DIV(c, C4NUM);
|
||||||
im_dst_y = h;
|
im_dst_y = 1;
|
||||||
} else {
|
} else {
|
||||||
MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
|
MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
|
|
|
@ -69,7 +69,7 @@ int TransposeOpenCLKernel::ReSize() { return RET_OK; }
|
||||||
|
|
||||||
int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
||||||
size_t im_dst_x, im_dst_y;
|
size_t im_dst_x, im_dst_y;
|
||||||
im_dst_x = UP_DIV(out_tensors_[0]->Height() * out_tensors_[0]->Width(), C4NUM);
|
im_dst_x = out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Width(), C4NUM);
|
||||||
im_dst_y = out_tensors_[0]->Channel();
|
im_dst_y = out_tensors_[0]->Channel();
|
||||||
size_t img_dtype = CL_FLOAT;
|
size_t img_dtype = CL_FLOAT;
|
||||||
if (enable_fp16_) {
|
if (enable_fp16_) {
|
||||||
|
@ -96,10 +96,12 @@ int TransposeOpenCLKernel::Run() {
|
||||||
|
|
||||||
cl_int2 HW = {h * w, hw4};
|
cl_int2 HW = {h * w, hw4};
|
||||||
cl_int2 C = {c, c4};
|
cl_int2 C = {c, c4};
|
||||||
ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
|
int arg_idx = 0;
|
||||||
ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
||||||
ocl_runtime->SetKernelArg(kernel_, 2, HW);
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
||||||
ocl_runtime->SetKernelArg(kernel_, 3, C);
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, HW);
|
||||||
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, C);
|
||||||
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, w);
|
||||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
|
@ -86,14 +86,14 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
|
||||||
inputs[0]->SetData(nullptr);
|
inputs[0]->SetData(nullptr);
|
||||||
outputs[0]->SetData(nullptr);
|
outputs[0]->SetData(nullptr);
|
||||||
|
|
||||||
MS_LOG(INFO) << "Test ReshapeFp32 passed";
|
MS_LOG(INFO) << "Test Reshape passed";
|
||||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(TestReshapeOpenCL, ReshapeFp32) {
|
TEST_F(TestReshapeOpenCL, ReshapeFp32) {
|
||||||
int c = 7;
|
int c = 7;
|
||||||
std::vector<int> shape = {c};
|
std::vector<int> shape = {c};
|
||||||
std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
|
std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
|
||||||
std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
|
std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
|
||||||
|
|
||||||
RunTestCaseReshape(shape, input_data.data(), output_data.data(), false);
|
RunTestCaseReshape(shape, input_data.data(), output_data.data(), false);
|
||||||
|
@ -102,7 +102,7 @@ TEST_F(TestReshapeOpenCL, ReshapeFp32) {
|
||||||
TEST_F(TestReshapeOpenCL, ReshapeFp16) {
|
TEST_F(TestReshapeOpenCL, ReshapeFp16) {
|
||||||
int c = 7;
|
int c = 7;
|
||||||
std::vector<int> shape = {c};
|
std::vector<int> shape = {c};
|
||||||
std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
|
std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
|
||||||
std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
|
std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
|
||||||
|
|
||||||
RunTestCaseReshape(shape, input_data.data(), output_data.data(), true);
|
RunTestCaseReshape(shape, input_data.data(), output_data.data(), true);
|
||||||
|
|
|
@ -17,94 +17,134 @@
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include "mindspore/core/utils/log_adapter.h"
|
#include "mindspore/core/utils/log_adapter.h"
|
||||||
#include "common/common_test.h"
|
#include "common/common_test.h"
|
||||||
|
#include "mindspore/lite/src/common/file_utils.h"
|
||||||
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
|
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
|
||||||
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
|
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
|
||||||
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h"
|
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h"
|
||||||
#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
|
#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
|
class TestSoftmaxOpenCL : public mindspore::CommonTest {
|
||||||
|
public:
|
||||||
|
TestSoftmaxOpenCL() {}
|
||||||
|
};
|
||||||
|
|
||||||
class TestSoftmaxOpenCL : public mindspore::CommonTest {};
|
void RunTestCaseSoftmax(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
|
||||||
|
|
||||||
void RunTestCase(std::vector<int> input_shape, std::vector<int> output_shape, std::string input_file,
|
|
||||||
std::string expect_file, SoftmaxParameter *param, schema::Format format) {
|
|
||||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||||
ocl_runtime->Init();
|
ocl_runtime->Init();
|
||||||
|
size_t dtype_size = sizeof(float);
|
||||||
|
if (enable_fp16) {
|
||||||
|
ocl_runtime->SetFp16Enable(true);
|
||||||
|
dtype_size = sizeof(float16_t);
|
||||||
|
}
|
||||||
auto allocator = ocl_runtime->GetAllocator();
|
auto allocator = ocl_runtime->GetAllocator();
|
||||||
|
int n, h, w, c;
|
||||||
// define tensor
|
bool is_2d = false;
|
||||||
MS_LOG(INFO) << "defineTensor";
|
if (shape.size() == 2) {
|
||||||
auto data_type = kNumberTypeFloat32;
|
is_2d = true;
|
||||||
auto tensorType = schema::NodeType_ValueNode;
|
h = w = 1;
|
||||||
auto input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, format, tensorType);
|
n = shape[0];
|
||||||
auto output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, format, tensorType);
|
c = shape[1];
|
||||||
if (input_tensor == nullptr) {
|
} else {
|
||||||
MS_LOG(ERROR) << "input tensor null";
|
n = shape[0];
|
||||||
|
h = shape[1];
|
||||||
|
w = shape[2];
|
||||||
|
c = shape[3];
|
||||||
|
}
|
||||||
|
std::vector<int> input_shape = {n, h, w, c};
|
||||||
|
if (is_2d) {
|
||||||
|
input_shape = {n, c};
|
||||||
|
}
|
||||||
|
auto input_format = is_2d ? schema::Format_NC : schema::Format_NHWC;
|
||||||
|
auto input_dtype = enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
|
||||||
|
auto tensor_x_ptr = std::make_unique<lite::tensor::Tensor>(TypeId(input_dtype), input_shape, input_format);
|
||||||
|
auto tensor_x = tensor_x_ptr.get();
|
||||||
|
if (tensor_x == nullptr) {
|
||||||
|
MS_LOG(ERROR) << "tensor_x create error.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (output_tensor == nullptr) {
|
auto tensor_out_ptr = std::make_unique<lite::tensor::Tensor>(TypeId(input_dtype), input_shape, input_format);
|
||||||
MS_LOG(ERROR) << "output tensor null";
|
auto tensor_out = tensor_out_ptr.get();
|
||||||
|
if (tensor_out == nullptr) {
|
||||||
|
MS_LOG(ERROR) << "tensor_out create error.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
std::vector<lite::tensor::Tensor *> inputs{input_tensor};
|
std::vector<lite::tensor::Tensor *> inputs{tensor_x};
|
||||||
std::vector<lite::tensor::Tensor *> outputs{output_tensor};
|
std::vector<lite::tensor::Tensor *> outputs{tensor_out};
|
||||||
|
auto arith_kernel_ptr = std::make_unique<kernel::SoftmaxOpenCLKernel>(nullptr, inputs, outputs);
|
||||||
// run
|
auto arith_kernel = arith_kernel_ptr.get();
|
||||||
MS_LOG(INFO) << "NewOpenCLKernel";
|
if (arith_kernel == nullptr) {
|
||||||
auto *kernel = new kernel::SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
|
MS_LOG(ERROR) << "arith_kernel create error.";
|
||||||
if (kernel == nullptr) {
|
|
||||||
MS_LOG(ERROR) << "kernel null";
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
MS_LOG(INFO) << "KernelInit";
|
arith_kernel->Init();
|
||||||
kernel->Init();
|
|
||||||
|
|
||||||
std::vector<kernel::LiteKernel *> kernels{kernel};
|
|
||||||
inputs[0]->MallocData(allocator);
|
inputs[0]->MallocData(allocator);
|
||||||
auto *pGraph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
|
|
||||||
|
std::vector<kernel::LiteKernel *> kernels{arith_kernel};
|
||||||
|
auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
|
||||||
|
auto pGraph = pGraph_ptr.get();
|
||||||
if (pGraph == nullptr) {
|
if (pGraph == nullptr) {
|
||||||
MS_LOG(ERROR) << "pGraph null";
|
MS_LOG(ERROR) << "pGraph create error.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
MS_LOG(INFO) << "pGraphinit";
|
|
||||||
pGraph->Init();
|
pGraph->Init();
|
||||||
|
memcpy(inputs[0]->Data(), input_data, inputs[0]->ElementsNum() * dtype_size);
|
||||||
// load data
|
|
||||||
MS_LOG(INFO) << "load data1";
|
|
||||||
LoadTestData(input_tensor->Data(), input_tensor->Size(), input_file);
|
|
||||||
auto *input_data = reinterpret_cast<float *>(input_tensor->Data());
|
|
||||||
printf("\ninput[0:10]:");
|
|
||||||
for (int i = 0; i < 10; i++) {
|
|
||||||
printf("[%d]:%.3f ", i, input_data[i]);
|
|
||||||
}
|
|
||||||
printf("\n\n");
|
|
||||||
|
|
||||||
MS_LOG(INFO) << "Run";
|
|
||||||
pGraph->Run();
|
pGraph->Run();
|
||||||
|
|
||||||
MS_LOG(INFO) << "compare result";
|
if (enable_fp16) {
|
||||||
CompareOutput(output_tensor, expect_file, static_cast<float>(1e-5));
|
CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
|
||||||
for (auto tensor : inputs) {
|
} else {
|
||||||
delete tensor;
|
CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
|
||||||
}
|
}
|
||||||
for (auto tensor : outputs) {
|
inputs[0]->SetData(nullptr);
|
||||||
delete tensor;
|
outputs[0]->SetData(nullptr);
|
||||||
}
|
|
||||||
delete kernel;
|
MS_LOG(INFO) << "Test Softmax passed";
|
||||||
delete pGraph;
|
|
||||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(TestSoftmaxOpenCL, Softmax_1) {
|
TEST_F(TestSoftmaxOpenCL, Softmax2DFp32) {
|
||||||
std::vector<int> input_shape = {1, 2, 2, 8};
|
int n = 1;
|
||||||
std::vector<int> output_shape = {1, 2, 2, 8};
|
int c = 10;
|
||||||
std::string input_file = "softmax_in.bin";
|
std::vector<int> shape = {n, c};
|
||||||
std::string expect_file = "softmax_out.bin";
|
std::vector<float> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
auto param = new (std::nothrow) SoftmaxParameter;
|
std::vector<float> output_data = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f};
|
||||||
param->axis_ = 3;
|
|
||||||
schema::Format format = schema::Format_NHWC4;
|
|
||||||
|
|
||||||
RunTestCase(input_shape, output_shape, input_file, expect_file, param, format);
|
RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(TestSoftmaxOpenCL, Softmax2DFp16) {
|
||||||
|
int n = 1;
|
||||||
|
int c = 10;
|
||||||
|
std::vector<int> shape = {n, c};
|
||||||
|
std::vector<float16_t> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
std::vector<float16_t> output_data = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f};
|
||||||
|
|
||||||
|
RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(TestSoftmaxOpenCL, Softmax4DFp32) {
|
||||||
|
int n = 1;
|
||||||
|
int h = 2;
|
||||||
|
int w = 1;
|
||||||
|
int c = 5;
|
||||||
|
std::vector<int> shape = {n, h, w, c};
|
||||||
|
std::vector<float> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
std::vector<float> output_data = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
|
||||||
|
|
||||||
|
RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(TestSoftmaxOpenCL, Softmax4DFp16) {
|
||||||
|
int n = 1;
|
||||||
|
int h = 2;
|
||||||
|
int w = 1;
|
||||||
|
int c = 5;
|
||||||
|
std::vector<int> shape = {n, h, w, c};
|
||||||
|
std::vector<float16_t> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
std::vector<float16_t> output_data = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
|
||||||
|
|
||||||
|
RunTestCaseSoftmax(shape, input_data.data(), output_data.data(), true);
|
||||||
|
}
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -117,8 +117,8 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(TestTransposeOpenCL, TransposeFp16) {
|
TEST_F(TestTransposeOpenCL, TransposeFp16) {
|
||||||
int h = 4;
|
int h = 2;
|
||||||
int w = 1;
|
int w = 2;
|
||||||
int c = 3;
|
int c = 3;
|
||||||
std::vector<int> shape = {h, w, c};
|
std::vector<int> shape = {h, w, c};
|
||||||
std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
|
std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
|
||||||
|
|
Loading…
Reference in New Issue