From 2802572a629b79798b6d1add086fbb70ed7e8d94 Mon Sep 17 00:00:00 2001
From: chenzupeng
Date: Thu, 3 Dec 2020 11:04:23 +0800
Subject: [PATCH] fix bug: conv2d performance, profiling mode

---
 .../lite/src/runtime/kernel/opencl/cl/conv2d.cl   | 10 +++++-----
 .../lite/src/runtime/kernel/opencl/opencl_subgraph.cc |  1 +
 .../lite/src/runtime/opencl/opencl_executor.cc    | 21 ++++++++++-----------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
index f603ef5cf16..7028833ee01 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
@@ -30,11 +30,11 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
   exp1 = exp(-data);                   \
   data = (exp0 - exp1) / (exp0 + exp1);
 
-#define DO_LEAKY_RELU(data)          \
-  if (data.x < 0) data.x *= alpha;   \
-  if (data.y < 0) data.y *= alpha;   \
-  if (data.z < 0) data.z *= alpha;   \
-  if (data.w < 0) data.w *= alpha;
+#define DO_LEAKY_RELU(data)                        \
+  data.x = data.x > 0 ? data.x : data.x * alpha;   \
+  data.y = data.y > 0 ? data.y : data.y * alpha;   \
+  data.z = data.z > 0 ? data.z : data.z * alpha;   \
+  data.w = data.w > 0 ? data.w : data.w * alpha;
 
 __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
                             __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index 40363eba697..e09619fb637 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -229,6 +229,7 @@ int OpenCLSubGraph::Init() {
     return ret;
   }
   auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
+  // If tuning_mode is DEFAULT, just malloc memory for reuse.
   ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
index cb1c888d9c8..aca0c32e3be 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
@@ -31,8 +31,11 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
                               std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
                               const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
   int ret{RET_OK};
-  ocl_runtime.GetInstance()->SetProfiling(is_tune);
-
+  auto opencl_runtime_ins = ocl_runtime.GetInstance();
+  auto profiling_tmp = opencl_runtime_ins->isProfiling();
+  if (is_tune) {
+    opencl_runtime_ins->SetProfiling(true);
+  }
   kernel::LiteKernelUtil::InitTensorRefCount(kernels);
   for (auto *kernel : kernels) {
     MS_ASSERT(kernel);
@@ -79,6 +82,10 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
         MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
         return ret;
       }
+      if (profiling_tmp) {
+        MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
+                     << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
+      }
     }
     if (after != nullptr) {
       if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
@@ -92,16 +99,8 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
         MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
       }
     }
-#ifdef Debug
-    MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
-                 << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
-#endif
   }
-#ifdef Debug
-  ocl_runtime.GetInstance()->SetProfiling(true);
-#else
-  ocl_runtime.GetInstance()->SetProfiling(false);
-#endif
+  opencl_runtime_ins->SetProfiling(profiling_tmp);
   return ret;
 }
 }  // namespace mindspore::lite::opencl
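
Note: the executor change follows a save/force/restore pattern around the runtime's profiling flag: remember the caller's setting via isProfiling(), force profiling on only for the tuning path, and restore the saved value before returning, instead of hard-coding the flag behind #ifdef Debug. The sketch below is illustrative only and not part of the patch; MockRuntime and RunOrTuneSketch are hypothetical stand-ins, and only the isProfiling()/SetProfiling() pair mirrors calls that appear in the diff above.

// Illustrative sketch only (not part of the patch). MockRuntime and
// RunOrTuneSketch are made-up stand-ins for the real classes.
#include <iostream>

struct MockRuntime {
  bool profiling = false;
  bool isProfiling() const { return profiling; }
  void SetProfiling(bool on) { profiling = on; }
};

// Save the caller's profiling setting, enable profiling only while tuning,
// and restore the saved setting before returning (replacing the old
// unconditional #ifdef Debug toggle).
int RunOrTuneSketch(MockRuntime *rt, bool is_tune) {
  const bool profiling_tmp = rt->isProfiling();
  if (is_tune) {
    rt->SetProfiling(true);
  }
  // ... run or tune kernels here; per-kernel times are logged only when
  // profiling_tmp was already set by the caller ...
  rt->SetProfiling(profiling_tmp);
  return 0;
}

int main() {
  MockRuntime rt;
  RunOrTuneSketch(&rt, /*is_tune=*/true);
  std::cout << std::boolalpha << "profiling restored to " << rt.isProfiling() << std::endl;
  return 0;
}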