!2491 add cpu kernel profiling log

Merge pull request !2491 from kisnwang/add-cpu-kernel-profiling
This commit is contained in:
mindspore-ci-bot 2020-06-23 19:01:14 +08:00 committed by Gitee
commit cc5a2408e6
3 changed files with 18 additions and 13 deletions

View File

@@ -26,6 +26,7 @@
#include "device/cpu/cpu_device_address.h"
#include "utils/context/ms_context.h"
#include "utils/config_manager.h"
#include "utils/profile.h"
#include "common/utils.h"
#include "session/anf_runtime_algorithm.h"
#include "session/session_basic.h"
@@ -270,6 +271,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
auto kernels = kernel_graph->execution_order();
for (const auto &kernel : kernels) {
#ifdef ENABLE_PROFILE
double start_time = GetTime();
#endif
std::vector<kernel::AddressPtr> kernel_inputs;
std::vector<kernel::AddressPtr> kernel_workspaces;
std::vector<kernel::AddressPtr> kernel_outputs;
@@ -297,6 +301,10 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
if (!ret) {
MS_LOG(EXCEPTION) << "Launch kernel failed.";
}
#ifdef ENABLE_PROFILE
double cost_time = GetTime() - start_time;
MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us";
#endif
}
return true;
}

View File

@@ -29,7 +29,7 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
auto linear = input_params->linear_;
auto lr = input_params->lr_;
auto l1 = input_params->l1_;
auto l2 = input_params->l2_;
auto l2_plus = 2 * input_params->l2_;
auto lr_power = input_params->lr_power_;
auto unique_sparse_grad = input_params->sparse_grad_;
auto var_first_dim_size = input_params->var_first_dim_size_;
@@ -44,21 +44,18 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
auto accum_new = accum[j] + summed_grad * summed_grad;
if (lr_power == -0.5) {
linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j];
} else {
linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * var[j];
}
auto x = Sign(linear[j]) * l1 - linear[j];
float y;
if (lr_power == -0.5) {
y = std::sqrt(accum_new) / lr + 2 * l2;
y = std::sqrt(accum_new);
linear[j] += summed_grad - (y - std::sqrt(accum[j])) / lr * var[j];
} else {
y = std::pow(accum_new, -lr_power) / lr + 2 * l2;
y = std::pow(accum_new, -lr_power);
linear[j] += summed_grad - (y - std::pow(accum[j], -lr_power)) / lr * var[j];
}
auto pre_shrink = x / y;
var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0;
accum[j] = accum_new;
auto x = Sign(linear[j]) * l1 - linear[j];
y = y / lr + l2_plus;
var[j] = std::fabs(linear[j]) > l1 ? x / y : 0;
}
}
}

View File

@@ -112,10 +112,10 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
auto tensor_address = tensor->device_address();
bool need_sync = false;
if (ms_context->enable_pynative_infer()) {
if (tensor_address.get() == nullptr || tensor_address != device_address) {
if (tensor_address == nullptr || tensor_address != device_address) {
need_sync = true;
}
} else if (tensor->is_dirty()) {
} else if (tensor->is_dirty() || tensor_address == nullptr) {
need_sync = true;
} else if (tensor_address != device_address) {
if (tensor_address->DeviceType() == device_address->DeviceType()) {