transpose output set to buffer

This commit is contained in:
chenzupeng 2020-08-17 20:56:16 +08:00
parent 19c800a758
commit 4d3be49a66
14 changed files with 134 additions and 13 deletions

View File

@ -3,7 +3,7 @@
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
@ -43,3 +43,44 @@ __kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t d
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
}
__kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
return;
}
FLT4 result[4];
result[0] = (FLT4)(0.0f);
result[1] = (FLT4)(0.0f);
result[2] = (FLT4)(0.0f);
result[3] = (FLT4)(0.0f);
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
result[0].x = x0.x;
result[0].y = x1.x;
result[0].z = x2.x;
result[0].w = x3.x;
result[1].x = x0.y;
result[1].y = x1.y;
result[1].z = x2.y;
result[1].w = x3.y;
result[2].x = x0.z;
result[2].y = x1.z;
result[2].z = x2.z;
result[2].w = x3.z;
result[3].x = x0.w;
result[3].y = x1.w;
result[3].z = x2.w;
result[3].w = x3.w;
dst_data[4 * Y * HW.y + X] = result[0];
dst_data[(4 * Y + 1) * HW.y + X] = result[1];
dst_data[(4 * Y + 2) * HW.y + X] = result[2];
dst_data[(4 * Y + 3) * HW.y + X] = result[3];
}

View File

@ -36,7 +36,11 @@ namespace mindspore::kernel {
int TransposeOpenCLKernel::Init() {
std::string kernel_name = "transpose";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
if (!is_image_out_) {
kernel_name += "_BUF";
} else {
kernel_name += "_IMG";
}
#ifdef PROGRAM_WITH_IL
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
#else
@ -60,8 +64,12 @@ int TransposeOpenCLKernel::Init() {
MS_LOG(ERROR) << "input H * W % 4 != 0 not support!";
return RET_ERROR;
}
ori_format_ = out_tensors_[0]->GetFormat();
// Transpose::InferShape just set output->SetFormat(input->GetFormat()); -^-!
ori_format_ = schema::Format_NCHW;
out_tensors_[0]->SetFormat(schema::Format_NCHW);
if (!is_image_out_) {
out_mem_type_ = OpenCLMemType::BUF;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

View File

@ -38,6 +38,7 @@ class TransposeOpenCLKernel : public OpenCLKernel {
private:
cl::Kernel kernel_;
bool is_image_out_ = false;
};
} // namespace mindspore::kernel

View File

@ -36,11 +36,11 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *
for (size_t i = 0; i < in_tensors.size(); ++i) {
OpenCLKernel *cur_opencl_op = reinterpret_cast<OpenCLKernel *>(in_kernels[i]);
schema::Format ori_format = cur_opencl_op->GetOriFormat();
if (mem_type == cur_opencl_op->GetMemType() && in_tensors[i]->GetFormat() == ori_format) {
if (mem_type == OpenCLMemType::BUF && mem_type == cur_opencl_op->GetMemType() &&
in_tensors[i]->GetFormat() == ori_format) {
continue;
}
auto dst_format =
(mem_type == OpenCLMemType::IMG) ? in_kernels[i]->out_tensors()[0]->GetFormat() : ori_format;
auto dst_format = (mem_type == OpenCLMemType::IMG) ? in_kernels[i]->out_tensors()[0]->GetFormat() : ori_format;
auto src_format =
(mem_type == OpenCLMemType::IMG) ? in_tensors[i]->GetFormat() : in_kernels[i]->out_tensors()[0]->GetFormat();
lite::tensor::Tensor *new_tensor = new (std::nothrow) lite::tensor::Tensor();

View File

@ -125,6 +125,7 @@ int RunSubGraphOpenCLKernel(const std::vector<lite::tensor::Tensor *> &inputs,
MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error.";
return RET_ERROR;
}
delete sub_graph;
return RET_OK;
}
@ -180,6 +181,7 @@ TEST_F(TestActivationOpenCL, LeakyReluFp32_dim4) {
delete input_tensor;
delete output_tensor;
lite::opencl::OpenCLRuntime::DeleteInstance();
return;
}
} // namespace mindspore

View File

@ -119,6 +119,11 @@ TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) {
}
printf("test all close OK!\n");
lite::CompareOutputData(output_data, expect, 4);
delete tensor_in;
delete tensor_out;
delete pooling_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

View File

@ -175,6 +175,14 @@ TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) {
sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data());
CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001);
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete concat_kernel;
delete sub_graph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

View File

@ -108,5 +108,14 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) {
CompareOutputData(output_data, correct_data, oh * ow * co, 0.00001);
MS_LOG(INFO) << "Test Conv2dTransposeFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

View File

@ -120,6 +120,14 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri
MyCompareOutput(output_tensor, expect_file);
// lite::CompareOutput(reinterpret_cast<float *>(output_tensor->Data()), expect_file);
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete conv_kernel;
delete sub_graph;
mindspore::lite::opencl::OpenCLRuntime::DeleteInstance();
}

View File

@ -75,12 +75,15 @@ TEST_F(TestMatMulOpenCL, MatMulFp32) {
// compare
CompareOutputData(output_data, correct_data, co, 0.00001);
delete input_data;
delete weight_data;
delete tensor_x;
delete tensor_w;
delete tensor_out;
delete correct_data;
MS_LOG(INFO) << "TestMatMulFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

View File

@ -92,6 +92,15 @@ TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) {
MS_LOG(INFO) << "compare result";
std::cout << "compare result" << std::endl;
CompareOutput(output_tensor, expect_file);
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete pooling_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

View File

@ -77,6 +77,15 @@ void RunTestCase(std::vector<int> input_shape, std::vector<int> output_shape, st
MS_LOG(INFO) << "compare result";
std::cout << "compare result" << std::endl;
CompareOutput(output_tensor, expect_file);
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
TEST_F(TestSoftmaxOpenCL, Softmax_1) {

View File

@ -75,5 +75,14 @@ TEST_F(TestToFormatOpenCL, TransposeFp32) {
// compare
CompareOutputData(output_data, correct_data, h * w * c, 0.00001);
MS_LOG(INFO) << "TestMatMulFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

View File

@ -75,5 +75,14 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) {
// compare
CompareOutputData(output_data, correct_data, h * w * c, 0.00001);
MS_LOG(INFO) << "TestMatMulFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore