forked from mindspore-Ecosystem/mindspore
!5609 [MS][LITE][Develop] concat ops support muti-dimension
Merge pull request !5609 from pengyongrong/concat
This commit is contained in:
commit
e85a7a979a
|
@ -2,39 +2,106 @@
|
|||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
|
||||
int2 input_channels, int4 output_shape) {
|
||||
int X = get_global_id(0); // H
|
||||
int4 input_shape0, int4 input_shape1, int4 output_shape, const int axis) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
if (Z < input_channels.x) {
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X)));
|
||||
if (axis == 0) {
|
||||
if (X < input_shape0.x * input_shape0.y) {
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
} else {
|
||||
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X)));
|
||||
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.x * input_shape0.y)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
} else if (axis == 1) {
|
||||
if (X < input_shape0.y) {
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
} else {
|
||||
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
} else if (axis == 2) {
|
||||
if (Y < input_shape0.z) {
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
} else {
|
||||
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
} else {
|
||||
if (Z < input_shape0.w) {
|
||||
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
} else {
|
||||
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void Concat3input(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
|
||||
__write_only image2d_t output, int3 input_channels, int4 output_shape) {
|
||||
int X = get_global_id(0); // H
|
||||
__write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
|
||||
int4 output_shape, const int axis) {
|
||||
int X = get_global_id(0); // N*H
|
||||
int Y = get_global_id(1); // W
|
||||
int Z = get_global_id(2); // c/4
|
||||
if (X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
|
||||
return;
|
||||
}
|
||||
if (Z < input_channels.x) {
|
||||
FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X)));
|
||||
if (axis == 0) {
|
||||
if (X < input_shape0.x * input_shape0.y) {
|
||||
FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0);
|
||||
} else if (Z < (input_channels.x + input_channels.y)) {
|
||||
FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X)));
|
||||
} else if (X < (input_shape0.x * input_shape0.y + input_shape1.x * input_shape1.y)) {
|
||||
FLT4 result1 =
|
||||
READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.x * input_shape0.y)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1);
|
||||
} else {
|
||||
FLT4 result2 = READ_IMAGE(
|
||||
input2, smp_none,
|
||||
(int2)((Y)*input_shape2.w + Z, (X - input_shape0.x * input_shape0.y - input_shape1.x * input_shape1.y)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2);
|
||||
}
|
||||
} else if (axis == 1) {
|
||||
if (X < input_shape0.y) {
|
||||
FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0);
|
||||
} else if (X < (input_shape0.y + input_shape1.y)) {
|
||||
FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1);
|
||||
} else {
|
||||
FLT4 result2 =
|
||||
READ_IMAGE(input2, smp_none, (int2)((Y)*input_channels.z + Z - input_channels.x - input_channels.y, (X)));
|
||||
READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2);
|
||||
}
|
||||
} else if (axis == 2) {
|
||||
if (Y < input_shape0.z) {
|
||||
FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0);
|
||||
} else if (Y < (input_shape0.z + input_shape0.z)) {
|
||||
FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1);
|
||||
} else {
|
||||
FLT4 result2 =
|
||||
READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2);
|
||||
}
|
||||
} else {
|
||||
if (Z < input_shape0.w) {
|
||||
FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0);
|
||||
} else if (Z < (input_shape0.w + input_shape0.w)) {
|
||||
FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1);
|
||||
} else {
|
||||
FLT4 result2 =
|
||||
READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X)));
|
||||
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -141,7 +141,7 @@ kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vector<lite::tensor:
|
|||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init kernel failed, name: Convolution";
|
||||
MS_LOG(ERROR) << " Init kernel failed, name: Batchnorm ";
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
@ -33,10 +33,10 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
size_t im_dst_x, im_dst_y;
|
||||
if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
|
||||
im_dst_x = out_tensors_[0]->Width() * CO4;
|
||||
im_dst_y = out_tensors_[0]->Height();
|
||||
im_dst_y = out_tensors_[0]->Height() * out_tensors_[0]->Batch();
|
||||
} else {
|
||||
im_dst_y = out_tensors_[0]->Height() * CO4;
|
||||
im_dst_x = out_tensors_[0]->Width();
|
||||
im_dst_x = out_tensors_[0]->Width() * out_tensors_[0]->Batch();
|
||||
}
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
@ -57,14 +57,11 @@ int ConcatOpenCLKernel::Init() {
|
|||
|
||||
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
|
||||
MS_LOG(DEBUG) << " concat at axis=: " << param->axis_;
|
||||
if (param->axis_ != 0 && param->axis_ != 3) {
|
||||
MS_LOG(ERROR) << "only support axis=0 or axis=3";
|
||||
if (param->axis_ < 0 || param->axis_ > 3) {
|
||||
MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 ";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
if (param->axis_ == 0) {
|
||||
return RET_OK;
|
||||
}
|
||||
if (in_tensors_.size() == 2) {
|
||||
std::set<std::string> build_options;
|
||||
std::string source = concat_source;
|
||||
|
@ -94,33 +91,6 @@ int ConcatOpenCLKernel::Init() {
|
|||
|
||||
int ConcatOpenCLKernel::ReSize() { return RET_OK; }
|
||||
|
||||
int ConcatOpenCLKernel::Run_axis0() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto allocator_ = ocl_runtime->GetAllocator();
|
||||
cl::CommandQueue *command_queue = ocl_runtime->GetDefaultCommandQueue();
|
||||
|
||||
for (auto &tensor : in_tensors_) {
|
||||
auto buffer = static_cast<cl::Buffer *>(allocator_->GetBuffer(tensor->Data()));
|
||||
ocl_runtime->MapBuffer(*buffer, CL_MAP_READ, tensor->Size(), command_queue, true);
|
||||
}
|
||||
for (auto &tensor : out_tensors_) {
|
||||
auto buffer = static_cast<cl::Buffer *>(allocator_->GetBuffer(tensor->Data()));
|
||||
ocl_runtime->MapBuffer(*buffer, CL_MAP_WRITE, tensor->Size(), command_queue, true);
|
||||
}
|
||||
|
||||
memcpy(out_tensors_[0]->Data(), in_tensors_[0]->Data(), in_tensors_[0]->Size());
|
||||
memcpy(reinterpret_cast<char *>(out_tensors_[0]->Data()) + in_tensors_[0]->Size(), in_tensors_[1]->Data(),
|
||||
in_tensors_[1]->Size());
|
||||
|
||||
for (auto tensors : {&in_tensors_, &out_tensors_}) {
|
||||
for (auto &tensor : *tensors) {
|
||||
auto buffer = static_cast<cl::Buffer *>(allocator_->GetBuffer(tensor->Data()));
|
||||
ocl_runtime->UnmapBuffer(*buffer, tensor->Data());
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConcatGetBiggestDividerWithPriority(int number, int max_divider) {
|
||||
if (number % 8 == 0 && max_divider >= 8) {
|
||||
return number / 8;
|
||||
|
@ -156,19 +126,17 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
|
|||
int ConcatOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
|
||||
if (param->axis_ == 0) {
|
||||
return Run_axis0();
|
||||
}
|
||||
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto input0_shape = in_tensors_[0]->shape();
|
||||
auto input1_shape = in_tensors_[1]->shape();
|
||||
auto input1_shape = in_tensors_[0]->shape();
|
||||
auto input2_shape = in_tensors_[1]->shape();
|
||||
auto output_shape = out_tensors_[0]->shape();
|
||||
|
||||
cl_int2 input0_shape2_ = {UP_DIV(input0_shape[3], C4NUM), UP_DIV(input1_shape[3], C4NUM)}; // change
|
||||
cl_int4 input_shape1_ = {input1_shape[0], input1_shape[1], input1_shape[2], UP_DIV(input1_shape[3], C4NUM)};
|
||||
cl_int4 input_shape2_ = {input2_shape[0], input2_shape[1], input2_shape[2], UP_DIV(input2_shape[3], C4NUM)};
|
||||
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
|
||||
|
||||
uint32_t OH = output_shape[1]; // N*H
|
||||
uint32_t OH = output_shape[0] * output_shape[1]; // N*H
|
||||
uint32_t OW = output_shape[2];
|
||||
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
|
||||
|
||||
|
@ -182,20 +150,25 @@ int ConcatOpenCLKernel::Run() {
|
|||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
} else if (in_tensors_.size() == 3) {
|
||||
auto input2_shape = in_tensors_[2]->shape();
|
||||
cl_int3 input0_shape3_ = {UP_DIV(input0_shape[3], C4NUM), UP_DIV(input1_shape[3], C4NUM),
|
||||
UP_DIV(input2_shape[3], C4NUM)};
|
||||
auto input3_shape = in_tensors_[2]->shape();
|
||||
cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)};
|
||||
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape3_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
|
||||
} else if (in_tensors_.size() < 2) {
|
||||
MS_LOG(ERROR) << " inputs must been >=2";
|
||||
MS_LOG(ERROR) << " input sizes must >= 2 ";
|
||||
return RET_ERROR;
|
||||
} else {
|
||||
MS_LOG(ERROR) << " only support inputs <= 3 ";
|
||||
|
@ -218,7 +191,7 @@ kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::tensor::Te
|
|||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init kernel failed, name: Convolution";
|
||||
MS_LOG(ERROR) << " Init kernel failed, name: Concat ";
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
@ -37,8 +37,6 @@ class ConcatOpenCLKernel : public OpenCLKernel {
|
|||
|
||||
int ReSize() override;
|
||||
|
||||
int Run_axis0();
|
||||
|
||||
int Run() override;
|
||||
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
|
||||
|
||||
|
|
|
@ -136,7 +136,7 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::tensor::Ten
|
|||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init kernel failed, name: Convolution";
|
||||
MS_LOG(ERROR) << " Init kernel failed, name: Slice ";
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
@ -94,7 +94,7 @@ int ToFormatOpenCLKernel::InitNHWCShape() {
|
|||
int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
|
||||
|
||||
int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
|
||||
std::vector<size_t> vec = {nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
|
||||
std::vector<size_t> vec = {nhwc_shape_[0] * nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
|
||||
*global_size = std::move(vec);
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -107,13 +107,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
|
|||
size_t im_dst_x, im_dst_y;
|
||||
std::vector<int> shapex = out_tensors_[0]->shape();
|
||||
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
|
||||
int c = shapex[1];
|
||||
int h = shapex[2];
|
||||
int c = shapex[1] * shapex[2];
|
||||
int h = shapex[0];
|
||||
int w = shapex[3];
|
||||
im_dst_y = h * UP_DIV(c, C4NUM);
|
||||
im_dst_x = w;
|
||||
} else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
|
||||
int h = shapex[1];
|
||||
int h = shapex[0] * shapex[1];
|
||||
int w = shapex[2];
|
||||
int c = shapex[3];
|
||||
im_dst_x = w * UP_DIV(c, C4NUM);
|
||||
|
|
|
@ -32,13 +32,6 @@ class TestBatchnormOpenCLfp16 : public mindspore::CommonTest {
|
|||
TestBatchnormOpenCLfp16() {}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) {
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
T abs = fabs(output_data[i] - correct_data[i]);
|
||||
ASSERT_LE(abs, err_bound);
|
||||
}
|
||||
}
|
||||
TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) {
|
||||
MS_LOG(INFO) << "begin test";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
@ -59,7 +52,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) {
|
|||
std::string var_path = "./test_data/batchnorm_varfp16.bin";
|
||||
std::string offset_path = "./test_data/batchnorm_offsetfp16.bin";
|
||||
std::string scale_path = "./test_data/batchnorm_scalefp16.bin";
|
||||
std::string output_path = "./test_data/batchnorm_out_datafp16.bin";
|
||||
std::string output_path = "./test_data/batchnorm_correctdatafp16.bin";
|
||||
auto input_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
|
||||
auto correct_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size));
|
||||
size_t mean_size, var_size, scale_size, offset_size;
|
||||
|
@ -148,7 +141,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) {
|
|||
sub_graph->Run();
|
||||
|
||||
auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->Data());
|
||||
CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001);
|
||||
CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.01);
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
|
@ -158,7 +151,6 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) {
|
|||
delete param;
|
||||
delete batchnorm_kernel;
|
||||
delete sub_graph;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) {
|
||||
MS_LOG(INFO) << " begin test ";
|
||||
|
@ -268,7 +260,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) {
|
|||
sub_graph->Run();
|
||||
|
||||
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data());
|
||||
CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001);
|
||||
CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001);
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
|
@ -278,6 +270,5 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) {
|
|||
delete param;
|
||||
delete batchnorm_kernel;
|
||||
delete sub_graph;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -18,70 +18,10 @@
|
|||
#include "utils/log_adapter.h"
|
||||
#include "common/common_test.h"
|
||||
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
|
||||
#include "mindspore/lite/src/common/file_utils.h"
|
||||
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
|
||||
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h"
|
||||
|
||||
template <typename T>
|
||||
void ConcatComputeByCPU_2input_dim4_axis3(const T *input0, const T *input1, T *output, std::vector<int> input_shape0,
|
||||
std::vector<int> input_shape1, std::vector<int> output_shape,
|
||||
const int axis) {
|
||||
int postion, index0 = 0, index1 = 0;
|
||||
for (int i = 0; i < output_shape[0]; i++) {
|
||||
for (int j = 0; j < output_shape[1]; j++) {
|
||||
for (int k = 0; k < output_shape[2]; k++) {
|
||||
postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] +
|
||||
k * output_shape[3];
|
||||
for (int w = 0; w < output_shape[3]; w++) {
|
||||
if (w < input_shape0[3] + input_shape1[3]) {
|
||||
output[postion++] = (w < input_shape0[3]) ? input0[index0++] : input1[index1++];
|
||||
} else {
|
||||
for (int ind = input_shape0[3] + input_shape1[3]; ind < output_shape[3]; ind++) {
|
||||
output[postion++] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void ConcatComputeByCPU_3input_dim4_axis3(T *input0, T *input1, T *input2, T *output, std::vector<int> input_shape0,
|
||||
std::vector<int> input_shape1, std::vector<int> input_shape2,
|
||||
std::vector<int> output_shape, const int axis) {
|
||||
int postion, index0 = 0, index1 = 0, index2 = 0;
|
||||
for (int i = 0; i < output_shape[0]; i++) {
|
||||
for (int j = 0; j < output_shape[1]; j++) {
|
||||
for (int k = 0; k < output_shape[2]; k++) {
|
||||
postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] +
|
||||
k * output_shape[3];
|
||||
for (int w = 0; w < output_shape[3]; w++) {
|
||||
if (w < input_shape0[3]) {
|
||||
int align = UP_DIV(input_shape0[3], 4) * 4;
|
||||
index0 = i * input_shape0[1] * input_shape0[2] * align + j * input_shape0[2] * align + k * align + w;
|
||||
output[postion++] = input0[index0];
|
||||
} else if (w >= input_shape0[3] && w < (input_shape0[3] + input_shape1[3])) {
|
||||
int align = UP_DIV(input_shape1[3], 4) * 4;
|
||||
index1 = i * input_shape1[1] * input_shape1[2] * align + j * input_shape1[2] * align + k * align + w -
|
||||
input_shape0[3];
|
||||
output[postion++] = input1[index1];
|
||||
} else if ((input_shape0[3] + input_shape1[3]) <= w &&
|
||||
w < (input_shape0[3] + input_shape1[3] + input_shape2[3])) {
|
||||
int align = UP_DIV(input_shape2[3], 4) * 4;
|
||||
index2 = i * input_shape2[1] * input_shape2[2] * align + j * input_shape2[2] * align + k * align + w -
|
||||
input_shape0[3] - input_shape1[3];
|
||||
output[postion++] = input2[index2];
|
||||
} else {
|
||||
for (int ind = input_shape0[3] + input_shape1[3] + input_shape2[3]; ind < output_shape[3]; ind++) {
|
||||
output[postion++] = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace mindspore {
|
||||
class TestConcatOpenCLfp32 : public mindspore::CommonTest {
|
||||
public:
|
||||
|
@ -106,11 +46,23 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) {
|
|||
ocl_runtime->Init();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
|
||||
// get the input from .bin
|
||||
size_t input1_size, input2_size, input3_size, output_size;
|
||||
std::string input1Ppath = "./test_data/concatfp16_input1.bin";
|
||||
std::string input2Ppath = "./test_data/concatfp16_input2.bin";
|
||||
std::string input3Ppath = "./test_data/concatfp16_input3.bin";
|
||||
std::string correctOutputPath = "./test_data/concatfp16_output.bin";
|
||||
auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
|
||||
auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
|
||||
auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
|
||||
auto correctOutput =
|
||||
reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
|
||||
|
||||
MS_LOG(INFO) << " init tensors ";
|
||||
constexpr int INPUT_NUM = 3;
|
||||
std::array<std::vector<int>, INPUT_NUM> input_shapes = {
|
||||
std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}};
|
||||
std::vector<int> output_shape = {1, 16, 256, 240};
|
||||
constexpr int INPUT_NUM = 2;
|
||||
std::array<std::vector<int>, INPUT_NUM> input_shapes = {std::vector<int>{1, 19, 19, 96},
|
||||
std::vector<int>{1, 19, 19, 96}};
|
||||
std::vector<int> output_shape = {2, 19, 19, 96};
|
||||
auto data_type = kNumberTypeFloat16;
|
||||
auto tensor_type = schema::NodeType_ValueNode;
|
||||
std::vector<lite::tensor::Tensor *> inputs;
|
||||
|
@ -146,7 +98,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) {
|
|||
}
|
||||
return;
|
||||
}
|
||||
param->axis_ = 3;
|
||||
param->axis_ = 0;
|
||||
auto *concat_kernel =
|
||||
new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
|
||||
if (concat_kernel == nullptr) {
|
||||
|
@ -165,7 +117,6 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) {
|
|||
for (auto &input_tensor : inputs) {
|
||||
input_tensor->MallocData(allocator);
|
||||
}
|
||||
|
||||
MS_LOG(INFO) << " initialize sub_graph ";
|
||||
std::vector<kernel::LiteKernel *> kernels{concat_kernel};
|
||||
auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
|
||||
|
@ -182,33 +133,22 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) {
|
|||
return;
|
||||
}
|
||||
sub_graph->Init();
|
||||
unsigned int seed = 123;
|
||||
MS_LOG(INFO) << " initialize input data ";
|
||||
for (auto &input_tensor : inputs) {
|
||||
auto input_data = reinterpret_cast<float16_t *>(input_tensor->Data());
|
||||
for (int i = 0; i < input_tensor->ElementsNum(); ++i) {
|
||||
input_data[i] = static_cast<float16_t>(rand_r(&seed) % 10 + 1);
|
||||
}
|
||||
}
|
||||
|
||||
// compute the result for CPU
|
||||
auto *input_data0 = reinterpret_cast<float16_t *>(inputs[0]->Data());
|
||||
auto *input_data1 = reinterpret_cast<float16_t *>(inputs[1]->Data());
|
||||
std::vector<float16_t> output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]);
|
||||
if (inputs.size() == 2) {
|
||||
ConcatComputeByCPU_2input_dim4_axis3(input_data0, input_data1, output_data_cpu.data(), input_shapes[0],
|
||||
input_shapes[1], output_shape, param->axis_);
|
||||
}
|
||||
if (inputs.size() == 3) {
|
||||
auto *input_data2 = reinterpret_cast<float16_t *>(inputs[2]->Data());
|
||||
ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0],
|
||||
input_shapes[1], input_shapes[2], output_shape, param->axis_);
|
||||
memcpy(inputs[0]->Data(), input_data1, input1_size);
|
||||
memcpy(inputs[1]->Data(), input_data2, input2_size);
|
||||
} else if (inputs.size() == 3) {
|
||||
memcpy(inputs[0]->Data(), input_data1, input1_size);
|
||||
memcpy(inputs[1]->Data(), input_data2, input2_size);
|
||||
memcpy(inputs[2]->Data(), input_data3, input3_size);
|
||||
} else {
|
||||
MS_LOG(ERROR) << " input size must be 2 or 3";
|
||||
}
|
||||
|
||||
std::cout << "==================output data================" << std::endl;
|
||||
sub_graph->Run();
|
||||
auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->Data());
|
||||
CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001);
|
||||
CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
|
@ -218,7 +158,6 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) {
|
|||
delete param;
|
||||
delete concat_kernel;
|
||||
delete sub_graph;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
|
||||
TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
|
||||
|
@ -227,16 +166,27 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
|
|||
ocl_runtime->Init();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
|
||||
// get the input from .bin
|
||||
size_t input1_size, input2_size, input3_size, output_size;
|
||||
std::string input1Ppath = "./test_data/concat_input1.bin";
|
||||
std::string input2Ppath = "./test_data/concat_input2.bin";
|
||||
std::string input3Ppath = "./test_data/concat_input3.bin";
|
||||
std::string correctOutputPath = "./test_data/concat_output.bin";
|
||||
auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
|
||||
auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
|
||||
auto input_data3 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
|
||||
auto correctOutput = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
|
||||
|
||||
MS_LOG(INFO) << " init tensors ";
|
||||
constexpr int INPUT_NUM = 3;
|
||||
std::array<std::vector<int>, INPUT_NUM> input_shapes = {
|
||||
std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}};
|
||||
std::vector<int> output_shape = {1, 16, 256, 240};
|
||||
std::vector<int> output_shape = {1, 48, 256, 80};
|
||||
auto data_type = kNumberTypeFloat32;
|
||||
auto tensor_type = schema::NodeType_ValueNode;
|
||||
std::vector<lite::tensor::Tensor *> inputs;
|
||||
for (auto &shape : input_shapes) {
|
||||
auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC4, tensor_type);
|
||||
auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
|
||||
inputs.push_back(input_temp);
|
||||
if (input_temp == nullptr) {
|
||||
MS_LOG(INFO) << " new input_tensor failed ";
|
||||
|
@ -244,7 +194,7 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
|
|||
}
|
||||
}
|
||||
auto *output_tensor =
|
||||
new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type);
|
||||
new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
|
||||
if (output_tensor == nullptr) {
|
||||
MS_LOG(INFO) << " new output_tensor failed ";
|
||||
for (auto tensor : inputs) {
|
||||
|
@ -267,7 +217,7 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
|
|||
}
|
||||
return;
|
||||
}
|
||||
param->axis_ = 3;
|
||||
param->axis_ = 1;
|
||||
auto *concat_kernel =
|
||||
new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
|
||||
if (concat_kernel == nullptr) {
|
||||
|
@ -303,33 +253,22 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
|
|||
return;
|
||||
}
|
||||
sub_graph->Init();
|
||||
unsigned int seed = 123;
|
||||
MS_LOG(INFO) << " initialize input data ";
|
||||
for (auto &input_tensor : inputs) {
|
||||
auto input_data = reinterpret_cast<float *>(input_tensor->Data());
|
||||
for (int i = 0; i < input_tensor->ElementsNum(); ++i) {
|
||||
input_data[i] = static_cast<float>(rand_r(&seed) % 10 + 1);
|
||||
}
|
||||
}
|
||||
|
||||
// compute the result for CPU
|
||||
auto *input_data0 = reinterpret_cast<float *>(inputs[0]->Data());
|
||||
auto *input_data1 = reinterpret_cast<float *>(inputs[1]->Data());
|
||||
std::vector<float> output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]);
|
||||
if (inputs.size() == 2) {
|
||||
ConcatComputeByCPU_2input_dim4_axis3(input_data0, input_data1, output_data_cpu.data(), input_shapes[0],
|
||||
input_shapes[1], output_shape, param->axis_);
|
||||
}
|
||||
if (inputs.size() == 3) {
|
||||
auto *input_data2 = reinterpret_cast<float *>(inputs[2]->Data());
|
||||
ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0],
|
||||
input_shapes[1], input_shapes[2], output_shape, param->axis_);
|
||||
memcpy(inputs[0]->Data(), input_data1, input1_size);
|
||||
memcpy(inputs[1]->Data(), input_data2, input2_size);
|
||||
} else if (inputs.size() == 3) {
|
||||
memcpy(inputs[0]->Data(), input_data1, input1_size);
|
||||
memcpy(inputs[1]->Data(), input_data2, input2_size);
|
||||
memcpy(inputs[2]->Data(), input_data3, input3_size);
|
||||
} else {
|
||||
MS_LOG(ERROR) << " input size must be 2 or 3 ";
|
||||
}
|
||||
|
||||
std::cout << "==================output data================" << std::endl;
|
||||
sub_graph->Run();
|
||||
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data());
|
||||
CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001);
|
||||
CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
|
||||
for (auto tensor : inputs) {
|
||||
delete tensor;
|
||||
}
|
||||
|
@ -339,6 +278,5 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
|
|||
delete param;
|
||||
delete concat_kernel;
|
||||
delete sub_graph;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -47,20 +47,19 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) {
|
|||
auto allocator = ocl_runtime->GetAllocator();
|
||||
|
||||
MS_LOG(INFO) << " Read tensors from .bin ";
|
||||
std::vector<int> input_shape = {1, 256, 256, 48};
|
||||
std::vector<int> output_shape = {1, 255, 255, 15};
|
||||
std::vector<int> begin = {0, 1, 1, 7};
|
||||
std::vector<int> size = {1, 255, 255, 15};
|
||||
std::vector<int> input_shape = {1, 19, 19, 96};
|
||||
std::vector<int> output_shape = {1, 10, 10, 13};
|
||||
std::vector<int> begin = {0, 2, 3, 4};
|
||||
std::vector<int> size = {1, 10, 10, 13};
|
||||
auto data_type = kNumberTypeFloat32;
|
||||
auto tensor_type = schema::NodeType_ValueNode;
|
||||
|
||||
// get the input from .bin
|
||||
size_t input_size, output_size;
|
||||
std::string input_path = "./test_data/in_datafp32.bin";
|
||||
std::string output_path = "./test_data/out_datafp32.bin";
|
||||
std::string input_path = "./test_data/in_slicefp32.bin";
|
||||
std::string output_path = "./test_data/out_slicefp32.bin";
|
||||
auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
|
||||
auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size));
|
||||
|
||||
MS_LOG(INFO) << " construct tensors ";
|
||||
lite::tensor::Tensor *tensor_data =
|
||||
new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type);
|
||||
|
@ -148,7 +147,6 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) {
|
|||
}
|
||||
delete slice_kernel;
|
||||
delete sub_graph;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) {
|
||||
MS_LOG(INFO) << " begin test ";
|
||||
|
@ -201,7 +199,7 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) {
|
|||
MS_LOG(INFO) << " new SliceParameter failed ";
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int i = 0; i < input_shape.size(); i++) {
|
||||
param->begin_[i] = begin[i];
|
||||
param->size_[i] = size[i];
|
||||
}
|
||||
|
@ -259,6 +257,5 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) {
|
|||
}
|
||||
delete slice_kernel;
|
||||
delete sub_graph;
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
Loading…
Reference in New Issue