!5680 add gpu benchmark

Merge pull request !5680 from wandongdong/master
2020-09-03 21:51:26 +08:00 · 2020-09-03 21:51:26 +08:00 · 0aaaf29158
parent a50bc2182e 937dbf8b03
commit 0aaaf29158
9 changed files with 61 additions and 13 deletions
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@ -27,11 +27,6 @@
 #include "src/ir/tensor.h"
 #include "include/errorcode.h"

-#ifdef ENABLE_FP16
-using FLOAT_t = float16_t;
-#else
-using FLOAT_t = float;
-#endif

 // using mindspore::kernel::AddressPtr;
 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
@ -55,10 +55,20 @@ __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *d
  result[1] = (FLT4)(0.0f);
  result[2] = (FLT4)(0.0f);
  result[3] = (FLT4)(0.0f);
-  FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
-  FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
-  FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
-  FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+  bool over_size = W * C.y > 65535;
+  FLT4 x0, x1, x2, x3;
+  if (over_size) {
+    x0 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X));
+    x1 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 1));
+    x2 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 2));
+    x3 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 3));
+  } else {
+    x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
+    x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
+    x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
+    x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+  }
+
  result[0].x = x0.x;
  result[0].y = x1.x;
  result[0].z = x2.x;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@ -44,7 +44,7 @@ void ActivationOpenClKernel::InitBuffer() {
  alpha_buff_ = allocator->MapBuffer(alpha_buff_, CL_MAP_WRITE, nullptr, true);
  memset(alpha_buff_, 0x00, fp_size);
  if (enable_fp16_) {
-    auto fp16 = (float16_t)alpha_;
+    auto fp16 = (int16_t)alpha_;
    memcpy(alpha_buff_, &fp16, fp_size);
  } else {
    memcpy(alpha_buff_, &alpha_, fp_size);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@ -72,7 +72,7 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
  int div_ci = UP_DIV(ci, C4NUM);
  int div_co = UP_DIV(co, C4NUM);
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
-  auto data_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  auto data_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);

  // IHWO to OHWI4(I)4(O)(converter format is IHWO)
  // init padWeight_(buffer mem)
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@ -75,7 +75,7 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; }
 void MatMulOpenCLKernel::PadWeight() {
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();

-  size_t dtype_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
  padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
  padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
  memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@ -27,6 +27,7 @@ using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Reshape;
+using mindspore::schema::PrimitiveType_Squeeze;

 namespace mindspore::kernel {

@ -142,4 +143,6 @@ kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector<lite::tensor::T

 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/test/models_tflite_gpu.cfg
+++ b/mindspore/lite/test/models_tflite_gpu.cfg
@ -0,0 +1,3 @@
+mobilenet_v1_1.0_224.tflite
+mobilenet_v2_1.0_224.tflite
+resnet.tflite
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@ -310,6 +310,42 @@ function Run_arm64() {
        fi
 	#sleep 1
    done < ${models_tflite_awaretraining_config}
+
+    # Run gpu tflite converted models:
+    while read line; do
+        model_name=${line}
+        if [[ $model_name == \#* ]]; then
+          continue
+        fi
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd  /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        # run benchmark test without clib data
+        #echo ${model_name}
+        echo 'cd  /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+	#sleep 1
+    done < ${models_tflite_gpu_config}
 }

 # Print start msg before run testcase
@ -397,6 +433,7 @@ models_tflite_posttraining_config=${basepath}/models_tflite_posttraining.cfg
 models_onnx_config=${basepath}/models_onnx.cfg
 models_fp16_config=${basepath}/models_fp16.cfg
 models_mindspore_config=${basepath}/models_mindspore.cfg
+models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
 Convert_status=0

 rm -rf ${basepath}/ms_models
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
@ -36,7 +36,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
  size_t dtype_size = sizeof(float);
  if (enable_fp16) {
    ocl_runtime->SetFp16Enable(true);
-    dtype_size = sizeof(float16_t);
+    dtype_size = sizeof(int16_t);
  }
  auto allocator = ocl_runtime->GetAllocator();
  int ci = shape[0];