From 4cd94a4466a70d114cc9b41a9b4a8dfc61d0f01b Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Mon, 10 Aug 2020 15:53:12 +0800 Subject: [PATCH] transpose ouptut image2d --- .../runtime/kernel/opencl/cl/fp16/transpose.cl | 11 ++++++----- .../runtime/kernel/opencl/cl/fp32/transpose.cl | 11 ++++++----- .../runtime/kernel/opencl/kernel/arithmetic.cc | 6 +++--- .../src/runtime/kernel/opencl/kernel/transpose.cc | 15 +++++++++++++++ .../src/runtime/kernel/opencl/kernel/transpose.h | 2 +- mindspore/lite/test/st/benchmark_test.cc | 9 +++++++++ .../src/runtime/kernel/opencl/transpose_tests.cc | 3 +++ 7 files changed, 43 insertions(+), 14 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl index ebc3db633fb..73c4c076e76 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl @@ -1,8 +1,9 @@ #define FLT half #define FLT4 half4 #define READ_IMAGE read_imageh +#define WRITE_IMAGE write_imageh __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_data, int2 HW, int2 C) { +__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= HW.y || Y >= C.y) { @@ -37,8 +38,8 @@ __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_dat result[3].z = x2.w; result[3].w = x3.w; - dst_data[4 * Y * HW.y + X] = result[0]; - dst_data[(4 * Y + 1) * HW.y + X] = result[1]; - dst_data[(4 * Y + 2) * HW.y + X] = result[2]; - dst_data[(4 * Y + 3) * HW.y + X] = result[3]; + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]); + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]); + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]); + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl index 08069ee80ff..63def496440 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl @@ -1,8 +1,9 @@ #define FLT float #define FLT4 float4 #define READ_IMAGE read_imagef +#define WRITE_IMAGE write_imagef __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_data, int2 HW, int2 C) { +__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= HW.y || Y >= C.y) { @@ -37,8 +38,8 @@ __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_dat result[3].z = x2.w; result[3].w = x3.w; - dst_data[4 * Y * HW.y + X] = result[0]; - dst_data[(4 * Y + 1) * HW.y + X] = result[1]; - dst_data[(4 * Y + 2) * HW.y + X] = result[2]; - dst_data[(4 * Y + 3) * HW.y + X] = result[3]; + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]); + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]); + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]); + WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 96ee4b37be6..36acee68695 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -43,7 +43,7 @@ void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { size_t H = outputs_[0]->Batch() * outputs_[0]->Height(); size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); local_size_ = {16, 16}; - global_size_ = {H, W}; + global_size_ = {W, H}; } void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { @@ -140,7 +140,7 @@ int ArithmeticOpenCLKernel::Run() { bias_ = -1 * value; break; case PrimitiveType_Div: - bias_ = 1 / value; + weight_ = 1 / value; break; default: MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; @@ -152,7 +152,7 @@ int ArithmeticOpenCLKernel::Run() { runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); int H = outputs_[0]->Batch() * outputs_[0]->Height(); int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); - cl_int2 output_shape{H, W}; + cl_int2 output_shape{W, H}; runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); return 0; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index 82bf48d58eb..677f99739bc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -67,6 +67,21 @@ int TransposeOpenCLKernel::Init() { int TransposeOpenCLKernel::ReSize() { return 0; } +int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t im_dst_x, im_dst_y; + im_dst_x = UP_DIV(outputs_[0]->Height() * outputs_[0]->Width(), C4NUM); + im_dst_y = outputs_[0]->Channel(); +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + int TransposeOpenCLKernel::Run() { MS_LOG(DEBUG) << this->Name() << " Running!"; std::vector shapex = inputs_[0]->shape(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h index c16557b5b0c..f79f9a273ad 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h @@ -35,7 +35,7 @@ class TransposeOpenCLKernel : public OpenCLKernel { int Init() override; int ReSize() override; int Run() override; - + int GetImageSize(size_t idx, std::vector *img_size) override; private: cl::Kernel kernel_; }; diff --git a/mindspore/lite/test/st/benchmark_test.cc b/mindspore/lite/test/st/benchmark_test.cc index 48dd6dd7db3..7454565a042 100644 --- a/mindspore/lite/test/st/benchmark_test.cc +++ b/mindspore/lite/test/st/benchmark_test.cc @@ -50,6 +50,15 @@ auto status = RunBenchmark(5, argv); ASSERT_EQ(status, RET_OK); } +TEST_F(BenchmarkTest, Test_MV2_GPU) { +const char *argv[] = {"./benchmark", "--modelPath=./hiai/mobilenet_v2.ms", + "--inDataPath=./hiai/mobilenet_v2_in.bin", + "--calibDataPath=./hiai/mobilenet_v2_out.bin", + "--device=GPU"}; +auto status = RunBenchmark(5, argv); +ASSERT_EQ(status, RET_OK); +} + TEST_F(BenchmarkTest, TestHebing) { const char *argv[] = {"./benchmark", "--modelPath=./hiai/model_hebing_3branch.ms", "--inDataPath=./hiai/model_hebing_3branch.bin", diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc index 20324e0cdb7..0f432ba5994 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc @@ -66,6 +66,9 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) { size_n = size_n > 100 ? 100 : size_n; for (int i = 0; i < size_n; i++) { std::cout << output_data[i] << " "; + if ((i + 1) % c == 0) { + std::cout << std::endl; + } } std::cout << std::endl;