diff --git a/mindspore/lite/include/ms_tensor.h b/mindspore/lite/include/ms_tensor.h
index f3706af0955..ac0ee269b70 100644
--- a/mindspore/lite/include/ms_tensor.h
+++ b/mindspore/lite/include/ms_tensor.h
@@ -100,6 +100,10 @@ struct CallBackParam {
   std::string node_type; /**< node type argument */
 };
 
+struct GPUCallBackParam : CallBackParam {
+  double execute_time{-1.f}; /**< kernel execute time in ms; keeps the initial -1 until a measured time is assigned */
+};
+
 /// \brief KernelCallBack defined the function pointer for callBack.
 using KernelCallBack = std::function<bool(std::vector<tensor::MSTensor *> inputs,
                                           std::vector<tensor::MSTensor *> outputs, const CallBackParam &opInfo)>;
diff --git a/mindspore/lite/src/runtime/gpu/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/gpu/opencl/opencl_executor.cc
index 70c4447554b..b07ddf1335e 100644
--- a/mindspore/lite/src/runtime/gpu/opencl/opencl_executor.cc
+++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_executor.cc
@@ -32,15 +32,18 @@ int OpenCLExecutor::RunOrTune(const std::vector<Tensor *> &inputs, const std::ve
                               const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
   int ret{RET_OK};
   auto opencl_runtime_ins = ocl_runtime.GetInstance();
+  if (before != nullptr && after != nullptr) {
+    opencl_runtime_ins->SetProfiling(true);
+  }
   auto profiling_tmp = opencl_runtime_ins->isProfiling();
   if (is_tune) {
     opencl_runtime_ins->SetProfiling(true);
   }
   for (auto *kernel : kernels) {
     MS_ASSERT(kernel);
-    CallBackParam callbackParam;
+    GPUCallBackParam callbackParam;
     callbackParam.node_name = kernel->name();
-
+    callbackParam.node_type = kernel->type_str();
     if (before != nullptr) {
       if (!before(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
         MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name();
@@ -70,9 +73,12 @@ int OpenCLExecutor::RunOrTune(const std::vector<Tensor *> &inputs, const std::ve
         MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
         return ret;
       }
-      if (profiling_tmp)
+      if (profiling_tmp) {
+        const auto execute_time = op_kernel->GetProfilingTimeMs();  // queried once; reused by log and callback param
         MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
-                     << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
+                     << ") execute time is: " << execute_time << "ms";
+        callbackParam.execute_time = execute_time;
+      }
     }
     ret = kernel->PostProcess();
     if (ret != RET_OK) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
index 96508fe0ccc..86dacd00862 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
@@ -198,12 +198,29 @@ int WinogradOpenCLKernel::Run() {
   ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_);
 
   MS_LOG(DEBUG) << "winograd kernel1 Running!";
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &kernel2_event_);
 
   MS_LOG(DEBUG) << "winograd kernel2 Running!";
   ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());
-  ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &event_);
+  ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &kernel3_event_);
   return RET_OK;
 }
 
+// Device time in ms summed over the three winograd kernels; MAX_PROFILING_TIME_MILLI_SECOND if unavailable.
+double WinogradOpenCLKernel::GetProfilingTimeMs() {
+  if (!ocl_runtime_->isProfiling()) {
+    return MAX_PROFILING_TIME_MILLI_SECOND;
+  }
+  cl_ulong total_ns = 0;
+  for (const cl::Event *evt : {&event_, &kernel2_event_, &kernel3_event_}) {
+    cl_ulong start_ns = 0;  // zero-initialised so a failed query can never expose an indeterminate value
+    cl_ulong end_ns = 0;
+    if (evt->getProfilingInfo(CL_PROFILING_COMMAND_START, &start_ns) != CL_SUCCESS ||
+        evt->getProfilingInfo(CL_PROFILING_COMMAND_END, &end_ns) != CL_SUCCESS) {
+      return MAX_PROFILING_TIME_MILLI_SECOND;
+    }
+    total_ns += end_ns - start_ns;
+  }
+  return static_cast<double>(total_ns) * 1e-6;
+}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
index edff463cbba..cd7b88d5370 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
@@ -39,6 +39,7 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
 
   std::vector<BaseTuningParameter> GenerateTuningParam() override { return {}; }
   int Tune() override { return RET_OK; }
+  double GetProfilingTimeMs() override;
 
  private:
   void BuildKernel() override;
@@ -47,8 +48,10 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
 
   cl::Kernel kernel_4x4to36_;
   cl::Kernel kernel_36to4x4_;
+  cl::Event kernel2_event_;
   cl::NDRange global_4x4to36_, local_4x4to36_;
   cl::NDRange global_36to4x4_, local_36to4x4_;
+  cl::Event kernel3_event_;
   void *winograd_mem0_{nullptr};
   void *winograd_mem1_{nullptr};
 };
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index 8d10f280166..c0a0320725b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -195,7 +195,7 @@ class OpenCLKernel : public LiteKernel {
   lite::opencl::MemType GetMemType() { return out_mem_type_; }
   void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
   OpParameter *GetParameter() { return op_parameter_; }
-  double GetProfilingTimeMs();
+  virtual double GetProfilingTimeMs();
   int DequantWeight();
   void FreeDequantedWeight();
   virtual int InferShape();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index befb436edb4..d8bddb295cc 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -18,6 +18,7 @@
 #include <set>
 #include <map>
 #include <string>
+#include <utility>
 #include "src/runtime/gpu/opencl/opencl_executor.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/kernel/to_format.h"
@@ -467,4 +468,33 @@ int OpenCLSubGraph::Run() {
   }
   return RET_OK;
 }
+
+// Unmaps the input buffers, runs all nodes via the executor with the callbacks, then syncs the queue.
+int OpenCLSubGraph::Run(const KernelCallBack &before, const KernelCallBack &after) {
+  if (executor_ == nullptr) {
+    MS_LOG(ERROR) << "executor is nullptr";
+    return RET_ERROR;
+  }
+  for (auto &input : in_tensors_) {
+    MS_ASSERT(input);
+    if (input->data_c() == nullptr) {
+      MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
+      return RET_ERROR;
+    }
+    const int unmap_ret = allocator_->UnmapBuffer(input->data_c());
+    if (unmap_ret != RET_OK) {
+      return unmap_ret;
+    }
+  }
+
+  const int run_ret = executor_->Run(in_tensors_, out_tensors_, nodes_, allocator_, before, after);
+  if (run_ret != RET_OK) {
+    MS_LOG(ERROR) << "Run opencl executor failed: " << run_ret;
+    return run_ret;
+  }
+  if (!ocl_runtime_->SyncCommandQueue()) {
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
index 953e3e4ba91..8bdb24c00cf 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
@@ -46,7 +46,7 @@ class OpenCLSubGraph : public SubGraphKernel {
   int ReSize() override;
   int ReSize(bool interrupt);
   int Run() override;
-  int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };
+  int Run(const KernelCallBack &before, const KernelCallBack &after) override;
   int InsertOpsPass();
   bool IsSubGraphInferShapeDone();
 
diff --git a/mindspore/lite/tools/benchmark/benchmark.cc b/mindspore/lite/tools/benchmark/benchmark.cc
index c0c445459e4..a9d1454d3d1 100644
--- a/mindspore/lite/tools/benchmark/benchmark.cc
+++ b/mindspore/lite/tools/benchmark/benchmark.cc
@@ -568,10 +568,8 @@ int Benchmark::RunBenchmark() {
   }
 
   auto &cpu_device_ctx = context->device_list_[0];
-  if (flags_->cpu_bind_mode_ == MID_CPU) {
-    cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = MID_CPU;
-  } else if (flags_->cpu_bind_mode_ == HIGHER_CPU) {
-    cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = HIGHER_CPU;
+  if (flags_->cpu_bind_mode_ == MID_CPU || flags_->cpu_bind_mode_ == HIGHER_CPU) {
+    cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = CpuBindMode(flags_->cpu_bind_mode_);
   } else {
     cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND;
   }
@@ -611,9 +609,8 @@ int Benchmark::RunBenchmark() {
       return ret;
     }
   }
-  if (model != nullptr) {
-    model->Free();
-  }
+  if (model != nullptr) model->Free();
+
   ms_inputs_ = session_->GetInputs();
   auto end_prepare_time = GetTimeUs();
   MS_LOG(INFO) << "PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms";
@@ -682,147 +679,161 @@ void BenchmarkFlags::InitResizeDimsList() {
   }
 }
 
-int Benchmark::InitCallbackParameter() {
-  if (flags_->time_profiling_) {
-    // before callback
-    before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
-                            const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
-                            const CallBackParam &callParam) {
-      if (before_inputs.empty()) {
-        MS_LOG(INFO) << "The num of beforeInputs is empty";
-      }
-      if (before_outputs.empty()) {
-        MS_LOG(INFO) << "The num of beforeOutputs is empty";
-      }
-      if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
-        op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
-      }
-      if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
-        op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
-      }
-
-      op_call_times_total_++;
-      op_begin_ = GetTimeUs();
-      return true;
-    };
-
-    // after callback
-    after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
-                           const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
-                           const CallBackParam &call_param) {
-      uint64_t opEnd = GetTimeUs();
-
-      if (after_inputs.empty()) {
-        MS_LOG(INFO) << "The num of after inputs is empty";
-      }
-      if (after_outputs.empty()) {
-        MS_LOG(INFO) << "The num of after outputs is empty";
-      }
-
-      float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
-      op_cost_total_ += cost;
-      op_times_by_type_[call_param.node_type].first++;
-      op_times_by_type_[call_param.node_type].second += cost;
-      op_times_by_name_[call_param.node_name].first++;
-      op_times_by_name_[call_param.node_name].second += cost;
-      return true;
-    };
-  } else if (flags_->perf_profiling_) {
-#ifndef ENABLE_ARM64
-    MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
-    return RET_ERROR;
-#else
-    struct perf_event_attr pe, pe2;
-    memset(&pe, 0, sizeof(struct perf_event_attr));
-    memset(&pe2, 0, sizeof(struct perf_event_attr));
-    pe.type = PERF_TYPE_HARDWARE;
-    pe2.type = PERF_TYPE_HARDWARE;
-    pe.size = sizeof(struct perf_event_attr);
-    pe2.size = sizeof(struct perf_event_attr);
-    pe.disabled = 1;
-    pe2.disabled = 1;
-    pe.exclude_kernel = 1;   // don't count kernel
-    pe2.exclude_kernel = 1;  // don't count kernel
-    pe.exclude_hv = 1;       // don't count hypervisor
-    pe2.exclude_hv = 1;      // don't count hypervisor
-    pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
-    pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
-    if (flags_->perf_event_ == "CACHE") {
-      pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
-      pe2.config = PERF_COUNT_HW_CACHE_MISSES;
-    } else if (flags_->perf_event_ == "STALL") {
-      pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
-      pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
-    } else {
-      pe.config = PERF_COUNT_HW_CPU_CYCLES;
-      pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
+int Benchmark::InitTimeProfilingCallbackParameter() {
+  // before callback
+  before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
+                          const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
+                          const CallBackParam &callParam) {
+    if (before_inputs.empty()) {
+      MS_LOG(INFO) << "The num of beforeInputs is empty";
     }
-    perf_fd = syscall(__NR_perf_event_open, pe, 0, -1, -1, 0);
-    if (perf_fd == -1) {
-      MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
-      return RET_ERROR;
+    if (before_outputs.empty()) {
+      MS_LOG(INFO) << "The num of beforeOutputs is empty";
     }
-    perf_fd2 = syscall(__NR_perf_event_open, pe2, 0, -1, perf_fd, 0);
-    if (perf_fd2 == -1) {
-      MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
-      return RET_ERROR;
+    if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
+      op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
+    }
+    if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
+      op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
     }
-    struct PerfCount zero;
-    zero.value[0] = 0;
-    zero.value[1] = 0;
-    // before callback
-    before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
-                            const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
-                            const CallBackParam &callParam) {
-      if (before_inputs.empty()) {
-        MS_LOG(INFO) << "The num of beforeInputs is empty";
-      }
-      if (before_outputs.empty()) {
-        MS_LOG(INFO) << "The num of beforeOutputs is empty";
-      }
-      if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
-        op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
-      }
-      if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
-        op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
-      }
 
-      op_call_times_total_++;
-      ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
-      ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
-      return true;
-    };
+    op_call_times_total_++;
+    op_begin_ = GetTimeUs();
+    return true;
+  };
 
-    // after callback
-    after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
-                           const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
-                           const CallBackParam &call_param) {
-      struct PerfResult res;
-      ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
-      read(perf_fd, &res, sizeof(struct PerfResult));
+  // after callback
+  after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
+                         const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
+                         const CallBackParam &call_param) {
+    uint64_t opEnd = GetTimeUs();
 
-      if (after_inputs.empty()) {
-        MS_LOG(INFO) << "The num of after inputs is empty";
-      }
-      if (after_outputs.empty()) {
-        MS_LOG(INFO) << "The num of after outputs is empty";
-      }
-      float cost1 = static_cast<float>(res.values[0].value);
-      float cost2 = static_cast<float>(res.values[1].value);
-      op_cost_total_ += cost1;
-      op_cost2_total_ += cost2;
-      op_perf_by_type_[call_param.node_type].first++;
-      op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
-      op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
-      op_perf_by_name_[call_param.node_name].first++;
-      op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
-      op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
-      return true;
-    };
-#endif
-  }
+    if (after_inputs.empty()) {
+      MS_LOG(INFO) << "The num of after inputs is empty";
+    }
+    if (after_outputs.empty()) {
+      MS_LOG(INFO) << "The num of after outputs is empty";
+    }
+
+    float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
+    if (flags_->device_ == "GPU") {  // NOTE(review): assumes call_param really is a GPUCallBackParam here - confirm
+      const auto &gpu_param = static_cast<const GPUCallBackParam &>(call_param);  // bind by reference, no copy
+      cost = static_cast<float>(gpu_param.execute_time);
+    }
+    op_cost_total_ += cost;
+    op_times_by_type_[call_param.node_type].first++;
+    op_times_by_type_[call_param.node_type].second += cost;
+    op_times_by_name_[call_param.node_name].first++;
+    op_times_by_name_[call_param.node_name].second += cost;
+    return true;
+  };
   return RET_OK;
 }
+// Opens a group of two hardware perf counters chosen by flags_->perf_event_ and installs before/after
+// callbacks that accumulate both readings per node type and per node name. Needs ENABLE_ARM64.
+int Benchmark::InitPerfProfilingCallbackParameter() {
+#ifndef ENABLE_ARM64
+  MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
+  return RET_ERROR;
+#else
+  struct perf_event_attr pe, pe2;
+  memset(&pe, 0, sizeof(struct perf_event_attr));
+  memset(&pe2, 0, sizeof(struct perf_event_attr));
+  pe.type = PERF_TYPE_HARDWARE;
+  pe2.type = PERF_TYPE_HARDWARE;
+  pe.size = sizeof(struct perf_event_attr);
+  pe2.size = sizeof(struct perf_event_attr);
+  pe.disabled = 1;
+  pe2.disabled = 1;
+  pe.exclude_kernel = 1;   // don't count kernel
+  pe2.exclude_kernel = 1;  // don't count kernel
+  pe.exclude_hv = 1;       // don't count hypervisor
+  pe2.exclude_hv = 1;      // don't count hypervisor
+  pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+  pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+  if (flags_->perf_event_ == "CACHE") {
+    pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
+    pe2.config = PERF_COUNT_HW_CACHE_MISSES;
+  } else if (flags_->perf_event_ == "STALL") {
+    pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
+    pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
+  } else {
+    pe.config = PERF_COUNT_HW_CPU_CYCLES;
+    pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
+  }
+  perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);  // the syscall expects a pointer to the attr struct
+  if (perf_fd == -1) {
+    MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
+    return RET_ERROR;
+  }
+  perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);  // perf_fd is passed as the group fd
+  if (perf_fd2 == -1) {
+    MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
+    return RET_ERROR;
+  }
+  struct PerfCount zero;
+  zero.value[0] = 0;
+  zero.value[1] = 0;
+  // before callback; zero is captured by value because the callback is invoked after this function returns
+  before_call_back_ = [this, zero](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
+                                   const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
+                                   const CallBackParam &callParam) {
+    if (before_inputs.empty()) {
+      MS_LOG(INFO) << "The num of beforeInputs is empty";
+    }
+    if (before_outputs.empty()) {
+      MS_LOG(INFO) << "The num of beforeOutputs is empty";
+    }
+    if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
+      op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
+    }
+    if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
+      op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
+    }
+    op_call_times_total_++;
+    ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
+    ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
+    return true;
+  };
+
+  // after callback: disables the counter group, reads it and adds both values to the running totals
+  after_call_back_ = [this](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
+                            const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
+                            const CallBackParam &call_param) {
+    struct PerfResult res;
+    ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+    read(perf_fd, &res, sizeof(struct PerfResult));
+    if (after_inputs.empty()) {
+      MS_LOG(INFO) << "The num of after inputs is empty";
+    }
+    if (after_outputs.empty()) {
+      MS_LOG(INFO) << "The num of after outputs is empty";
+    }
+    float cost1 = static_cast<float>(res.values[0].value);
+    float cost2 = static_cast<float>(res.values[1].value);
+    op_cost_total_ += cost1;
+    op_cost2_total_ += cost2;
+    op_perf_by_type_[call_param.node_type].first++;
+    op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
+    op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
+    op_perf_by_name_[call_param.node_name].first++;
+    op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
+    op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
+    return true;
+  };
+#endif
+  return RET_OK;
+}
+
+int Benchmark::InitCallbackParameter() {
+  // Time profiling takes precedence over perf profiling; with neither flag set no callbacks are set up here.
+  if (flags_->time_profiling_) {
+    return InitTimeProfilingCallbackParameter();
+  } else if (flags_->perf_profiling_) {
+    return InitPerfProfilingCallbackParameter();
+  }
+  return RET_OK;
+}
 
 int Benchmark::Init() {
   if (this->flags_ == nullptr) {
@@ -859,13 +870,10 @@ int Benchmark::Init() {
     std::cerr << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0" << std::endl;
     return RET_ERROR;
   }
-
-  if (this->flags_->cpu_bind_mode_ == 2) {
-    MS_LOG(INFO) << "cpuBindMode = MID_CPU";
-    std::cout << "cpuBindMode = MID_CPU" << std::endl;
-  } else if (this->flags_->cpu_bind_mode_ == 1) {
-    MS_LOG(INFO) << "cpuBindMode = HIGHER_CPU";
-    std::cout << "cpuBindMode = HIGHER_CPU" << std::endl;
+  static const std::vector<std::string> CPU_BIND_MODE_MAP = {"NO_BIND", "HIGHER_CPU", "MID_CPU"};
+  if (this->flags_->cpu_bind_mode_ == 1 || this->flags_->cpu_bind_mode_ == 2) {  // only indices the table covers
+    MS_LOG(INFO) << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_];
+    std::cout << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_] << std::endl;
   } else {
     MS_LOG(INFO) << "cpuBindMode = NO_BIND";
     std::cout << "cpuBindMode = NO_BIND" << std::endl;
diff --git a/mindspore/lite/tools/benchmark/benchmark.h b/mindspore/lite/tools/benchmark/benchmark.h
index df298b1e2d9..c62c973d66d 100644
--- a/mindspore/lite/tools/benchmark/benchmark.h
+++ b/mindspore/lite/tools/benchmark/benchmark.h
@@ -163,6 +163,8 @@ class MS_API Benchmark {
                                      int *total_size);
 
   int InitCallbackParameter();
+  int InitTimeProfilingCallbackParameter();
+  int InitPerfProfilingCallbackParameter();
 
   int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);