From 2802572a629b79798b6d1add086fbb70ed7e8d94 Mon Sep 17 00:00:00 2001
From: chenzupeng
Date: Thu, 3 Dec 2020 11:04:23 +0800
Subject: [PATCH] fix bug: conv2d performance, profiling mode

---
 .../lite/src/runtime/kernel/opencl/cl/conv2d.cl   | 10 +++++-----
 .../lite/src/runtime/kernel/opencl/opencl_subgraph.cc |  1 +
 .../lite/src/runtime/opencl/opencl_executor.cc    | 21 ++++++++++-----------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
index f603ef5cf16..7028833ee01 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
@@ -30,11 +30,11 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
   exp1 = exp(-data);                   \
   data = (exp0 - exp1) / (exp0 + exp1);
 
-#define DO_LEAKY_RELU(data)          \
-  if (data.x < 0) data.x *= alpha;   \
-  if (data.y < 0) data.y *= alpha;   \
-  if (data.z < 0) data.z *= alpha;   \
-  if (data.w < 0) data.w *= alpha;
+#define DO_LEAKY_RELU(data)                        \
+  data.x = data.x > 0 ? data.x : data.x * alpha;   \
+  data.y = data.y > 0 ? data.y : data.y * alpha;   \
+  data.z = data.z > 0 ? data.z : data.z * alpha;   \
+  data.w = data.w > 0 ? data.w : data.w * alpha;
 
 __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
                             __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index 40363eba697..e09619fb637 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -229,6 +229,7 @@ int OpenCLSubGraph::Init() {
     return ret;
   }
   auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
+  // If tuning_mode is DEFAULT, just malloc memory for reuse.
   ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
index cb1c888d9c8..aca0c32e3be 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
@@ -31,8 +31,11 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
                               std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
                               const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
   int ret{RET_OK};
-  ocl_runtime.GetInstance()->SetProfiling(is_tune);
-
+  auto opencl_runtime_ins = ocl_runtime.GetInstance();
+  auto profiling_tmp = opencl_runtime_ins->isProfiling();
+  if (is_tune) {
+    opencl_runtime_ins->SetProfiling(true);
+  }
   kernel::LiteKernelUtil::InitTensorRefCount(kernels);
   for (auto *kernel : kernels) {
     MS_ASSERT(kernel);
@@ -79,6 +82,10 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
         MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
         return ret;
       }
+      if (profiling_tmp) {
+        MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
+                     << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
+      }
     }
     if (after != nullptr) {
       if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
@@ -92,16 +99,8 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
         MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
       }
     }
-#ifdef Debug
-    MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
-                 << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
-#endif
   }
-#ifdef Debug
-  ocl_runtime.GetInstance()->SetProfiling(true);
-#else
-  ocl_runtime.GetInstance()->SetProfiling(false);
-#endif
+  opencl_runtime_ins->SetProfiling(profiling_tmp);
   return ret;
 }
 }  // namespace mindspore::lite::opencl
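
Note: the executor change follows a save/force/restore pattern around the runtime's profiling flag: remember the caller's setting via isProfiling(), force profiling on only for the tuning path, and restore the saved value before returning, instead of hard-coding the flag behind #ifdef Debug. The sketch below is illustrative only and not part of the patch; MockRuntime and RunOrTuneSketch are hypothetical stand-ins, and only the isProfiling()/SetProfiling() pair mirrors calls that appear in the diff above.

// Illustrative sketch only (not part of the patch). MockRuntime and
// RunOrTuneSketch are made-up stand-ins for the real classes.
#include <iostream>

struct MockRuntime {
  bool profiling = false;
  bool isProfiling() const { return profiling; }
  void SetProfiling(bool on) { profiling = on; }
};

// Save the caller's profiling setting, enable profiling only while tuning,
// and restore the saved setting before returning (replacing the old
// unconditional #ifdef Debug toggle).
int RunOrTuneSketch(MockRuntime *rt, bool is_tune) {
  const bool profiling_tmp = rt->isProfiling();
  if (is_tune) {
    rt->SetProfiling(true);
  }
  // ... run or tune kernels here; per-kernel times are logged only when
  // profiling_tmp was already set by the caller ...
  rt->SetProfiling(profiling_tmp);
  return 0;
}

int main() {
  MockRuntime rt;
  RunOrTuneSketch(&rt, /*is_tune=*/true);
  std::cout << std::boolalpha << "profiling restored to " << rt.isProfiling() << std::endl;
  return 0;
}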