forked from mindspore-Ecosystem/mindspore

fix opencl infershape bug

parent 7324ee14c6 · commit d4f49b3a10
@@ -68,12 +68,14 @@ void *OpenCLAllocator::MinimumFit(MemType mem_type, size_t size, const ImageSize
 void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) {
   cl_int ret = CL_SUCCESS;
   MS_ASSERT(buffer);
   MS_ASSERT(size > 0);
   *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast<cl_mem_flags>(flags), size, data, &ret);
   if (*buffer == nullptr) {
     MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
     return nullptr;
   }
   void *host_ptr = ocl_runtime_->MapBuffer(**buffer, CL_MAP_READ | CL_MAP_WRITE, size);
   MS_ASSERT(host_ptr);
   if (host_ptr == nullptr) {
     delete *buffer;
     MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << *buffer << ", host_ptr=" << host_ptr;
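Note: a hedged caller-side sketch of the allocator routine above (the names `allocator` and `src_host` are assumptions, not from the diff). CreateBuffer returns the mapped host pointer and deletes the cl::Buffer itself when mapping fails, so the caller only needs to check the returned pointer:

// Sketch under assumed caller context (not part of the commit):
cl::Buffer *buffer = nullptr;
void *host_ptr = allocator->CreateBuffer(size, src_host, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &buffer);
if (host_ptr == nullptr) {
  // creation or mapping failed; CreateBuffer already logged and freed *buffer
  return nullptr;
}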
@@ -1,450 +1,80 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void to_format_NHWC_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
+#define BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT) \
+  __kernel void BUF_to_IMG_##src_dtype##_##dst_dtype(__global float4 *src_data, __write_only image2d_t dst_data, \
+                                                     int4 size, int4 shape) { \
+    int X = get_global_id(0); \
+    int Y = get_global_id(1); \
+    int Z = get_global_id(2); \
+    if (X >= size.x || Y >= size.y || Z >= size.z) { \
+      return; \
+    } \
+    DST_FLT##4 data = (DST_FLT##4)(0.f); \
+    int offset = (X * shape.z + Y) * shape.w + Z * 4; \
+    __global SRC_FLT *src_addr = (__global SRC_FLT *)src_data; \
+    src_addr += offset; \
+    if ((Z + 1) * 4 <= shape.w) { \
+      data = convert_##DST_FLT##4(((__global SRC_FLT##4 *)src_addr)[0]); \
+    } else { \
+      if ((shape.w - Z * 4) >= 1) { \
+        data.x = (DST_FLT)src_addr[0]; \
+      } \
+      if ((shape.w - Z * 4) >= 2) { \
+        data.y = (DST_FLT)src_addr[1]; \
+      } \
+      if ((shape.w - Z * 4) >= 3) { \
+        data.z = (DST_FLT)src_addr[2]; \
+      } \
+    } \
+    if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
+      WRITE_IMAGE_OUT(dst_data, (int2)(Y * size.z + Z, X), data); \
+    else \
+      WRITE_IMAGE_OUT(dst_data, (int2)(Z, X * size.y + Y), data); \
+  }
-  FLT4 data = (FLT4)(0.f);
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global float *src_addr = (__global float *)src_data;
-  src_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global float4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
+
+// BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT)
+BUF_to_IMG(float32, float32, float, float, write_imagef);
+BUF_to_IMG(float32, float16, float, half, write_imageh);
+BUF_to_IMG(float16, float16, half, half, write_imageh);
+BUF_to_IMG(int32, int32, float, float, write_imagef);
+BUF_to_IMG(uint32, uint32, float, float, write_imagef);
+
+#define IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN) \
+  __kernel void IMG_to_BUF_##src_dtype##_##dst_dtype(__read_only image2d_t src_data, __global DST_FLT##4 * dst_data, \
+                                                     int4 size, int4 shape) { \
+    int X = get_global_id(0); \
+    int Y = get_global_id(1); \
+    int Z = get_global_id(2); \
+    if (X >= size.x || Y >= size.y || Z >= size.z) { \
+      return; \
+    } \
+    DST_FLT##4 data; \
+    if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
+      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); \
+    else \
+      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); \
+    int offset = (X * shape.z + Y) * shape.w + Z * 4; \
+    __global DST_FLT *dst_addr = (__global DST_FLT *)dst_data; \
+    dst_addr += offset; \
+    if ((Z + 1) * 4 <= shape.w) { \
+      ((__global DST_FLT##4 *)dst_addr)[0] = data; \
+    } else { \
+      if (shape.w - Z * 4 >= 1) { \
+        dst_addr[0] = data.x; \
+      } \
+      if (shape.w - Z * 4 >= 2) { \
+        dst_addr[1] = data.y; \
+      } \
+      if (shape.w - Z * 4 >= 3) { \
+        dst_addr[2] = data.z; \
+      } \
+    } \
+  }
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-  else
-    WRITE_IMAGE(dst_data, (int2)(Z, X * size.y + Y), data);
-}
-__kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  FLT4 data = (FLT4)(0.f);
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global half *src_addr = (__global half *)src_data;
-  src_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global half4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-  else
-    WRITE_IMAGE(dst_data, (int2)(Z, X * size.y + Y), data);
-}
-__kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  FLT4 data = (FLT4)(0.f);
-  __global float *src_addr = (__global float *)src_data;
-  __global float *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global float *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global float *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global float4 *)src_addr_0)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = src_addr_0[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = src_addr_1[0];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = src_addr_2[0];
-    }
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-}
-__kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  FLT4 data = (FLT4)(0.f);
-  __global half *src_addr = (__global half *)src_data;
-  __global half *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global half *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global half *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global half4 *)src_addr_0)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = src_addr_0[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = src_addr_1[0];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = src_addr_2[0];
-    }
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-}
-__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global float *src_addr = (__global float *)src_data;
-  src_addr += offset;
-  FLT4 data = (FLT4)(0.f);
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global float4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  WRITE_IMAGE(dst_data, (int2)(Y, pos_ix), data);
-}
-__kernel void to_format_NHWC_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global half *src_addr = (__global half *)src_data;
-  src_addr += offset;
-  FLT4 data = (FLT4)(0.f);
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global half4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  WRITE_IMAGE(dst_data, (int2)(Y, pos_ix), data);
-}
-__kernel void to_format_NHWC4_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z]));
-}
-__kernel void to_format_NHWC4_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z]));
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data,
-                                                   int4 size, int4 shape) {
-  // size(h, w, c4, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c4
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y]));
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                  int4 shape) {
-  // size(h, w, c4, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c4
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y]));
-}
-__kernel void to_format_NCHW_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.y + Y) * size.x + X] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)));
-}
-__kernel void to_format_NCHW_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.y + Y) * size.x + X] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)));
-}
-__kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  float4 data;
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  else
-    data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Z, X * size.y + Y)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global float *dst_addr = (__global float *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global float4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global float *dst_addr = (__global float *)dst_data;
-  __global float *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global float *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global float *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global float4 *)dst_addr_0)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr_0[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr_1[0] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr_2[0] = data.z;
-    }
-  }
-}
-__kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global half *dst_addr = (__global half *)dst_data;
-  __global half *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global half *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global half *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global half4 *)dst_addr_0)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr_0[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr_1[0] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr_2[0] = data.z;
-    }
-  }
-}
-__kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  half4 data;
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  else
-    data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Z, X * size.y + Y)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global half *dst_addr = (__global half *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global half4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NC4HW4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, pos_ix)));
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global float *dst_addr = (__global float *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global float4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NC4HW4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, pos_ix)));
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global half *dst_addr = (__global half *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global half4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                   int4 shape) {
-  // size(h, w, c, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.x + X) * size.y + Y] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)));
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                                  int4 shape) {
-  // size(h, w, c, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.x + X) * size.y + Y] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)));
-}
-__kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
+
+// IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN)
+IMG_to_BUF(float32, float32, float, float, read_imagef);
+IMG_to_BUF(float16, float32, half, float, read_imageh);
+IMG_to_BUF(float16, float16, half, half, read_imageh);
+IMG_to_BUF(int32, int32, float, float, read_imagef);
+IMG_to_BUF(uint32, uint32, float, float, read_imagef);
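Note: the two macros above replace the twenty-odd hand-written to_format_* kernels with one parameterized template per direction. For reference, hand-expanding the instantiation BUF_to_IMG(float32, float16, float, half, write_imageh) — derived mechanically from the macro above, not a separate file — gives:

__kernel void BUF_to_IMG_float32_float16(__global float4 *src_data, __write_only image2d_t dst_data,
                                         int4 size, int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  half4 data = (half4)(0.f);
  int offset = (X * shape.z + Y) * shape.w + Z * 4;
  __global float *src_addr = (__global float *)src_data;
  src_addr += offset;
  if ((Z + 1) * 4 <= shape.w) {
    data = convert_half4(((__global float4 *)src_addr)[0]);  // full float4 -> half4 load
  } else {
    // channel tail: only 1-3 values remain in the last C4 slice
    if ((shape.w - Z * 4) >= 1) {
      data.x = (half)src_addr[0];
    }
    if ((shape.w - Z * 4) >= 2) {
      data.y = (half)src_addr[1];
    }
    if ((shape.w - Z * 4) >= 3) {
      data.z = (half)src_addr[2];
    }
  }
  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
    write_imageh(dst_data, (int2)(Y * size.z + Z, X), data);
  else  // fall back to a taller layout when W * C4 exceeds the device image width
    write_imageh(dst_data, (int2)(Z, X * size.y + Y), data);
}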
@@ -44,6 +44,19 @@ using mindspore::schema::PrimitiveType_Eltwise;
 namespace mindspore::kernel {
 
 int ArithmeticOpenCLKernel::CheckSpecs() {
+  for (auto &tensor : in_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 input";
+      return RET_ERROR;
+    }
+  }
+  for (auto &tensor : out_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 output";
+      return RET_ERROR;
+    }
+  }
+
   if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
     MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
     return RET_ERROR;
@@ -286,6 +286,7 @@ void Conv2DOpenCLKernel::InitFilter() {
   }
 
   FreeDequantedWeight();
+  FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c());
 }
 
 void Conv2DOpenCLKernel::InitBias() {
@@ -322,6 +323,7 @@ void Conv2DOpenCLKernel::InitBias() {
     }
   }
   allocator->UnmapBuffer(packed_bias_);
+  FreeTmpWeight(in_tensors_.at(kBiasIndex)->data_c());
 }
 
 void Conv2DOpenCLKernel::SetConstArgs() {
@@ -480,11 +482,9 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
   MS_ASSERT(!inputs.empty());
   MS_ASSERT(!outputs.empty());
   MS_ASSERT(opParameter);
-  MS_ASSERT(inputs.front()->shape().size() == 4);
-  MS_ASSERT(outputs.front()->shape().size() == 4);
   auto *conv_param = reinterpret_cast<ConvParameter *>(opParameter);
-  int input_channel = inputs.front()->shape().at(3);
-  int output_channel = outputs.front()->shape().at(3);
+  int input_channel = conv_param->input_channel_;
+  int output_channel = conv_param->output_channel_;
   int group = conv_param->group_;
 
   // case 1: depthwise conv2d
@@ -529,6 +529,10 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
     }
   }
   if (!infer_shape_done) {
+    StoreTmpWeight(inputs.at(kWeightIndex));
+    if (inputs.size() > kBiasIndex) {
+      StoreTmpWeight(inputs.at(kBiasIndex));
+    }
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
     return kernel;
   }
@@ -97,10 +97,11 @@ int ReshapeOpenCLKernel::Run() {
 }
 
 int ReshapeOpenCLKernel::PreProcess() {
-  if (Type() == PrimitiveType_Reshape && !infer_shape_flag_) {
+  if (Type() == PrimitiveType_Reshape && !op_parameter_->infer_flag_) {
     auto shape_tensor = in_tensors_[1];
     if (!shape_tensor->IsConst()) {
       ocl_runtime_->SyncCommandQueue();
       shape_tensor->MutableData();
     }
   }
   return OpenCLKernel::PreProcess();
@@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_Resize;
 namespace mindspore::kernel {
 
 int ResizeOpenCLKernel::CheckSpecs() {
-  if (in_tensors_.size() != 1 || out_tensors_.size() != 1) {
+  if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
     MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
     return RET_ERROR;
   }
@@ -119,6 +119,17 @@ int ResizeOpenCLKernel::Run() {
   return RET_OK;
 }
 
+int ResizeOpenCLKernel::PreProcess() {
+  if (Type() == PrimitiveType_Resize && !op_parameter_->infer_flag_) {
+    auto shape_tensor = in_tensors_[1];
+    if (!shape_tensor->IsConst()) {
+      ocl_runtime_->SyncCommandQueue();
+      shape_tensor->MutableData();
+    }
+  }
+  return OpenCLKernel::PreProcess();
+}
+
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
 }  // namespace mindspore::kernel
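Note: the Resize hook mirrors the Reshape one above. A commented sketch of the shared dynamic-shape pattern (the class name is hypothetical; member names come from the diff):

int SomeOpenCLKernel::PreProcess() {
  if (!op_parameter_->infer_flag_) {       // shape was unknown at graph-build time
    auto shape_tensor = in_tensors_[1];    // operand carrying the target shape
    if (!shape_tensor->IsConst()) {
      // the shape is produced by an earlier GPU kernel: flush and wait so the
      // host-side read below sees valid data
      ocl_runtime_->SyncCommandQueue();
      shape_tensor->MutableData();
    }
  }
  return OpenCLKernel::PreProcess();       // base class re-runs InferShape with real data
}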
@@ -34,6 +34,7 @@ class ResizeOpenCLKernel : public OpenCLKernel {
   int CheckSpecs() override;
   void SetConstArgs() override;
   void SetGlobalLocal() override;
+  int PreProcess() override;
 
  private:
   float getResizeScaleFactor(int input_size, int output_size);
@@ -71,8 +71,20 @@ int StackOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << " only support input size = 2 and output size = 1";
     return RET_ERROR;
   }
+  for (auto &tensor : in_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << " only support fp32/fp16 input";
+      return RET_ERROR;
+    }
+  }
+  for (auto &tensor : out_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << " only support fp32/fp16 output";
+      return RET_ERROR;
+    }
+  }
   if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
-    MS_LOG(ERROR) << " only support dim <= 4 ";
+    MS_LOG(ERROR) << " only support 0<dim<=4";
     return RET_ERROR;
   }
   axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() : axis_;
@@ -64,32 +64,22 @@ void ToFormatOpenCLKernel::SetGlobalLocal() {
 }
 
 int ToFormatOpenCLKernel::Prepare() {
-  std::map<TypeId, std::string> dtype_str{
-    {kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt32, "float"}};
-  std::string kernel_name;
-  if (out_mem_type_ == MemType::IMG) {
-    kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_.front()->data_type()];
-  } else {
-    kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_.front()->data_type()];
-  }
+  static std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float32"},
+                                                 {kNumberTypeFloat16, "float16"},
+                                                 {kNumberTypeInt32, "int32"},
+                                                 {kNumberTypeUInt32, "uint32"}};
+  auto in_tensor = in_tensors_.front();
+  auto out_tensor = out_tensors_.front();
+  std::string kernel_name = out_mem_type_ == MemType::IMG ? "BUF_to_IMG_" : "IMG_to_BUF_";
+  kernel_name += dtype_str[in_tensor->data_type()] + "_" + dtype_str[out_tensor->data_type()];
   this->set_name(kernel_name);
 
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
   std::string program_name = "to_format";
   std::string source = to_format_source;
   ocl_runtime_->LoadSource(program_name, source);
-  std::vector<std::string> ext_build_opt;
-  if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
-  } else {
-    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
-  }
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
 
-  auto output = GpuTensorInfo(out_tensors_.front());
+  auto output = GpuTensorInfo(out_tensor);
   N_ = output.N;
   H_ = output.H;
   W_ = output.W;
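Note: with the unified macros, Prepare() derives the kernel name purely from the direction and the dtype pair, so the old -DREAD_IMAGEIN build options become unnecessary. Examples of names produced by the code above:

//   out_mem_type_ == IMG, fp32 in, fp16 out   ->  "BUF_to_IMG_float32_float16"
//   out_mem_type_ == BUF, int32 in, int32 out ->  "IMG_to_BUF_int32_int32"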
@@ -112,15 +102,8 @@ int ToFormatOpenCLKernel::Run() {
 }
 
 int ToFormatOpenCLKernel::InferShape() {
-  if (infer_shape_flag_) {
-    return RET_OK;
-  }
   if (in_tensors_[0]->shape().size() == 0 || in_tensors_[0]->ElementsNum() < 0) {
     MS_LOG(ERROR) << "to_format op in tensor shape is 0, infer shape failed!";
     return RET_ERROR;
   }
   out_tensors_[0]->set_shape(in_tensors_[0]->shape());
-  infer_shape_flag_ = true;
+  op_parameter_->infer_flag_ = false;
   return RET_OK;
 }
@@ -97,7 +97,7 @@ int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
 }
 
 void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
-  printf("%-30s", name().c_str());
+  printf("%-30s ", name().c_str());
   if (out_tensors().empty()) {
     return;
   }
@@ -134,7 +134,9 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
 
   auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
   for (int i = 0; i < print_num && i < total_num; ++i) {
-    if (tensor->data_type() == kNumberTypeFloat16) {
+    if (tensor->data_type() == kNumberTypeInt32) {
+      printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
+    } else if (tensor->data_type() == kNumberTypeFloat16) {
       printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
     } else {
       printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
@@ -191,7 +193,7 @@ int OpenCLKernel::PostProcess() {
 }
 
 int OpenCLKernel::InferShape() {
-  if (infer_shape_flag_) {
+  if (op_parameter_->infer_flag_) {
     return RET_OK;
   }
   op_parameter_->infer_flag_ = true;
@@ -202,12 +204,11 @@ int OpenCLKernel::InferShape() {
     op_parameter_->infer_flag_ = false;
     return ret;
   }
-  infer_shape_flag_ = true;
   return RET_OK;
 }
 
 int OpenCLKernel::ReSize() {
-  if (infer_shape_flag_) {
+  if (op_parameter_->infer_flag_) {
     return RET_OK;
   }
   auto ret = InferShape();
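Note: the net effect of the two hunks above is that the kernel-local infer_shape_flag_ disappears and OpParameter::infer_flag_ becomes the single source of truth. A condensed sketch of the resulting flow — the actual inference call between the guard and the error check is elided in the hunk, so it is only described in comments here:

int OpenCLKernel::InferShape() {
  if (op_parameter_->infer_flag_) {
    return RET_OK;                      // already inferred, nothing to do
  }
  op_parameter_->infer_flag_ = true;    // mark before running the real inference
  // ... run the actual shape inference (not shown in this hunk) ...
  // on failure the hunk above rolls the flag back:
  //     op_parameter_->infer_flag_ = false; return ret;
  return RET_OK;
}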
@@ -27,6 +27,7 @@
 #include "src/runtime/gpu/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/dequant.h"
 #include "src/runtime/kernel/opencl/utils.h"
+#include "nnacl/resize_parameter.h"
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
@@ -35,15 +36,15 @@ namespace mindspore::kernel {
 
 struct OpenCLToFormatParameter {
   OpParameter op_parameter{};
   schema::Format src_format{schema::Format::Format_NHWC};
   schema::Format dst_format{schema::Format::Format_NHWC4};
   lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
 };
 
 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
-  MS_ASSERT(dst);
-  MS_ASSERT(src);
+  if (src == nullptr || src_num <= 0) {
+    return;
+  }
   auto *N = dst;
   auto *H = dst + 1;
   auto *W = dst + 2;
@@ -70,10 +71,12 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
   MS_ASSERT(dst);
-  MS_ASSERT(src);
   for (int i = 0; i < 4; ++i) {
     dst[i] = default_value;
   }
+  if (src == nullptr || src_num <= 0) {
+    return;
+  }
   Broadcast2GpuShape(dst, src, src_num);
 }
@@ -92,6 +95,10 @@ struct GpuTensorInfo {
     H = shape.s[1];
     W = shape.s[2];
     C = shape.s[3];
+    MS_ASSERT(N > 0);
+    MS_ASSERT(H > 0);
+    MS_ASSERT(W > 0);
+    MS_ASSERT(C > 0);
     Slice = UP_DIV(C, C4NUM);
 
     FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
@@ -167,7 +174,6 @@ class OpenCLKernel : public LiteKernel {
                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
       : LiteKernel(parameter, inputs, outputs, ctx) {
     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
-    infer_shape_flag_ = parameter->infer_flag_;
   }
   ~OpenCLKernel() override = default;
   int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
@@ -199,8 +205,6 @@ class OpenCLKernel : public LiteKernel {
   int DequantWeight();
   void FreeDequantedWeight();
   virtual int InferShape();
-  bool GetInferShapeFlag() { return infer_shape_flag_; }
-  void SetInferShapeFlag(bool flag) { infer_shape_flag_ = flag; }
 
  protected:
   static std::set<size_t> GenerateLocalByGlobal(size_t global_i);
@@ -225,7 +229,6 @@ class OpenCLKernel : public LiteKernel {
   cl::Event event_;
   void *restore_quant_data_{nullptr};
   bool dequant_flag_{false};
-  bool infer_shape_flag_{false};
 
  private:
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
@@ -241,7 +244,7 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
     free(opParameter);
     return nullptr;
   }
-  if (!reinterpret_cast<kernel::OpenCLKernel *>(kernel)->GetInferShapeFlag()) {
+  if (!opParameter->infer_flag_) {
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
     return kernel;
   }
@@ -121,8 +121,6 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
 
   for (size_t i = 0; i < in_tensors.size(); ++i) {
     auto *in_tensor = in_tensors.at(i);
-    auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC;
-    auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4;
     auto *new_tensor = new (std::nothrow)
       lite::Tensor(in_tensor->data_type(), in_tensor->shape(), in_tensor->format(), lite::Tensor::VAR);
     MS_ASSERT(new_tensor);
@@ -130,20 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
       MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
       return RET_ERROR;
     }
-    if (mem_type == MemType::IMG) {
-      new_tensor->set_format(dst_format);
-      in_tensor->set_format(src_format);
-    } else {
-      new_tensor->set_format(src_format);
-      in_tensor->set_format(dst_format);
-    }
-
     out_tensors->emplace_back(new_tensor);
     KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
     if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) {
      desc.data_type = kNumberTypeFloat16;
      new_tensor->set_data_type(kNumberTypeFloat16);
    }
    auto *parameter = static_cast<OpenCLToFormatParameter *>(malloc(sizeof(OpenCLToFormatParameter)));
    MS_ASSERT(parameter);
    if (parameter == nullptr) {
@@ -153,16 +140,7 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
       return RET_ERROR;
     }
     parameter->op_parameter.type_ = PRIM_TO_FORMAT;
-    bool output_shape_setted = true;
-    for (auto output : *out_tensors) {
-      if (output->shape().empty() || output->ElementsNum() < 0) {
-        output_shape_setted = false;
-        break;
-      }
-    }
-    parameter->op_parameter.infer_flag_ = output_shape_setted;
-    parameter->src_format = src_format;
-    parameter->dst_format = dst_format;
+    parameter->op_parameter.infer_flag_ = false;
     parameter->out_mem_type = mem_type;
     out_parameters->emplace_back(parameter);
     LiteKernel *in_convert_op = nullptr;
@@ -255,8 +233,7 @@ int OpenCLSubGraph::Init() {
 
 int OpenCLSubGraph::UpdateTensorDataTypePass() {
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
-  MS_ASSERT(in_tensors_[0]);
-  if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
+  if (is_fp16) {
     std::set<lite::Tensor *> out_set;
     out_set.insert(in_tensors_.begin(), in_tensors_.end());
     out_set.insert(out_tensors_.begin(), out_tensors_.end());
@@ -330,16 +307,6 @@ void OpenCLSubGraph::GetInOutNodes() {
   }
 }
 
-bool OpenCLSubGraph::IsSubGraphInferShapeDone() {
-  for (auto node : this->nodes_) {
-    auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
-    if (!opencl_kernel->GetInferShapeFlag()) {
-      return false;
-    }
-  }
-  return true;
-}
-
 int OpenCLSubGraph::Prepare() {
   for (const auto tensor : in_tensors_) {
     MS_ASSERT(tensor);
@@ -354,7 +321,6 @@ int OpenCLSubGraph::Prepare() {
     MS_LOG(ERROR) << "Create OpenCLExecutor fail";
     return RET_ERROR;
   }
-  auto ret = RET_OK;
   for (auto node : this->nodes_) {
     if (node == nullptr) {
       MS_LOG(ERROR) << "node in Subgraph is nullptr";
@@ -363,26 +329,28 @@ int OpenCLSubGraph::Prepare() {
     auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
     std::set<int> pre_init_weight_list = {schema::PrimitiveType_MatMul, schema::PrimitiveType_BiasAdd};
     if (pre_init_weight_list.find(opencl_kernel->Type()) != pre_init_weight_list.end()) {
-      ret = opencl_kernel->InitWeights();
+      auto ret = opencl_kernel->InitWeights();
       if (ret != RET_OK) {
         MS_LOG(ERROR) << "init weights " << node->name() << " failed";
         return ret;
       }
     }
-    if (opencl_kernel->GetInferShapeFlag()) {
-      ret = node->Prepare();
+    if (opencl_kernel->op_parameter()->infer_flag_) {
+      auto ret = node->Prepare();
       if (ret != RET_OK) {
         MS_LOG(ERROR) << "prepare node " << node->name() << " failed";
         return ret;
       }
     }
   }
-  auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
-  // If tuning_mode is DEFAULT, just malloc memory for reuse.
-  ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
-    return ret;
-  }
+  if (all_kernels_infer_done_) {
+    auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
+    // If tuning_mode is DEFAULT, just malloc memory for reuse.
+    auto ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
+      return ret;
+    }
+  }
   return RET_OK;
 }
@@ -423,7 +391,7 @@ int OpenCLSubGraph::ReSize(bool interrupt) {
     for (auto &output : outputs) {
      output->FreeData();
    }
-    opencl_kernel->SetInferShapeFlag(false);
+    opencl_kernel->op_parameter()->infer_flag_ = false;
   }
   for (auto kernel : nodes_) {
     auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
@@ -36,6 +36,9 @@ class OpenCLSubGraph : public SubGraphKernel {
     subgraph_type_ = kGpuSubGraph;
     this->name_ = "GpuSubGraph";
     nodes_set_.insert(nodes.begin(), nodes.end());
+    all_kernels_infer_done_ = std::all_of(nodes_.begin(), nodes_.end(), [](const kernel::LiteKernel *kernel) {
+      return kernel && kernel->op_parameter() && kernel->op_parameter()->infer_flag_;
+    });
   }
   ~OpenCLSubGraph() override;
 
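Note: all_kernels_infer_done_ is computed once at subgraph construction, and Prepare() (hunk above) uses it to skip the RunOrTune warm-up when any node still lacks inferred shapes. A long-hand equivalent of the std::all_of call, for illustration only:

bool done = true;
for (const kernel::LiteKernel *kernel : nodes_) {
  if (!(kernel && kernel->op_parameter() && kernel->op_parameter()->infer_flag_)) {
    done = false;  // one node without inferred shapes is enough to defer warm-up
    break;
  }
}
all_kernels_infer_done_ = done;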
@@ -48,7 +51,6 @@ class OpenCLSubGraph : public SubGraphKernel {
   int Run() override;
   int Run(const KernelCallBack &before, const KernelCallBack &after) override;
   int InsertOpsPass();
-  bool IsSubGraphInferShapeDone();
 
  private:
   void UnInit();
@@ -83,6 +85,7 @@ class OpenCLSubGraph : public SubGraphKernel {
   std::set<LiteKernel *> nodes_set_;
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
   lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
+  bool all_kernels_infer_done_ = false;
 };
 }  // namespace mindspore::kernel
@@ -296,4 +296,27 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tens
   return RET_OK;
 }
 
+static std::set<void *> tmp_weights;
+
+void StoreTmpWeight(lite::Tensor *tensor) {
+  MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
+  if (tensor && tensor->data_c() && tensor->Size()) {
+    void *new_data = malloc(tensor->Size());
+    MS_ASSERT(new_data);
+    if (new_data == nullptr) {
+      return;
+    }
+    memcpy(new_data, tensor->data_c(), tensor->Size());
+    tensor->set_data(new_data);
+    tmp_weights.insert(new_data);
+  }
+}
+
+void FreeTmpWeight(void *data) {
+  if (tmp_weights.count(data)) {
+    free(data);
+    tmp_weights.erase(data);
+  }
+}
+
 }  // namespace mindspore::kernel
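Note: this pair backs the Conv2D creator change earlier in the commit. A simplified lifecycle sketch assembled from the visible hunks (not verbatim code): when shape inference has not run yet, the creator snapshots weight/bias data that the framework may free before Prepare() is re-run, and InitFilter()/InitBias() later release the snapshot.

// in OpenCLConv2DCreator, when shapes are not inferred yet:
if (!infer_shape_done) {
  StoreTmpWeight(inputs.at(kWeightIndex));   // deep-copies the data and tracks the pointer in tmp_weights
  if (inputs.size() > kBiasIndex) {
    StoreTmpWeight(inputs.at(kBiasIndex));
  }
}
// later, after the weights are packed in InitFilter()/InitBias():
FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c());  // frees only if it was a tracked copy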
@@ -63,6 +63,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
 int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
                          TypeId expect_data_type, const std::vector<int> &expect_shape);
 
+void StoreTmpWeight(lite::Tensor *tensor);
+void FreeTmpWeight(void *tensor);
+
 template <class T1, class T2>
 void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane_in, int plane_out, int channel,
                       const std::function<T2(T1)> &to_dtype) {
@@ -224,7 +224,7 @@ int32_t Tensor::ElementsC4Num() const {
   if (this->category_ == CONST_SCALAR) {
     return 1;
   }
-  int32_t result = 0;
+  int32_t result = 1;
   if (this->shape_.size() == 4) {
     result = Batch() * Height() * Width() * ((Channel() + 3) / 4 * 4);
   } else if (this->shape_.size() == 2) {
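Note: a worked example of the channel-padded count in the 4-D branch above:

// shape = {1, 2, 2, 5}  ->  Batch = 1, Height = 2, Width = 2, Channel = 5
// padded channel = (5 + 3) / 4 * 4 = 8
// ElementsC4Num  = 1 * 2 * 2 * 8   = 32

Changing the initializer from 0 to 1 presumably keeps the result meaningful for ranks the if/else chain does not handle explicitly; the remaining branches are cut off in this hunk, so that reading is an assumption.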