From dc29cfcbf707a5ae6d020ff259d537bcc33e3514 Mon Sep 17 00:00:00 2001 From: kswang Date: Tue, 23 Jun 2020 11:31:20 +0800 Subject: [PATCH] add cpu profile time --- .../ccsrc/device/cpu/cpu_kernel_runtime.cc | 8 ++++++++ .../cpu/sparse_apply_ftrl_cpu_kernel.cc | 19 ++++++++----------- mindspore/ccsrc/session/gpu_session.cc | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc index 6725dff5247..cfcc1b7c796 100644 --- a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc @@ -26,6 +26,7 @@ #include "device/cpu/cpu_device_address.h" #include "utils/context/ms_context.h" #include "utils/config_manager.h" +#include "utils/profile.h" #include "common/utils.h" #include "session/anf_runtime_algorithm.h" #include "session/session_basic.h" @@ -270,6 +271,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { auto kernels = kernel_graph->execution_order(); for (const auto &kernel : kernels) { +#ifdef ENABLE_PROFILE + double start_time = GetTime(); +#endif std::vector kernel_inputs; std::vector kernel_workspaces; std::vector kernel_outputs; @@ -297,6 +301,10 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { if (!ret) { MS_LOG(EXCEPTION) << "Launch kernel failed."; } +#ifdef ENABLE_PROFILE + double cost_time = GetTime() - start_time; + MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us"; +#endif } return true; } diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc index 005195ea332..af014022d11 100644 --- a/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc @@ -29,7 +29,7 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en auto linear = input_params->linear_; auto lr = input_params->lr_; auto l1 = input_params->l1_; - auto l2 = input_params->l2_; + auto l2_plus = 2 * input_params->l2_; auto lr_power = input_params->lr_power_; auto unique_sparse_grad = input_params->sparse_grad_; auto var_first_dim_size = input_params->var_first_dim_size_; @@ -44,21 +44,18 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { auto summed_grad = unique_sparse_grad.value_[k]; auto accum_new = accum[j] + summed_grad * summed_grad; - if (lr_power == -0.5) { - linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j]; - } else { - linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * var[j]; - } - auto x = Sign(linear[j]) * l1 - linear[j]; float y; if (lr_power == -0.5) { - y = std::sqrt(accum_new) / lr + 2 * l2; + y = std::sqrt(accum_new); + linear[j] += summed_grad - (y - std::sqrt(accum[j])) / lr * var[j]; } else { - y = std::pow(accum_new, -lr_power) / lr + 2 * l2; + y = std::pow(accum_new, -lr_power); + linear[j] += summed_grad - (y - std::pow(accum[j], -lr_power)) / lr * var[j]; } - auto pre_shrink = x / y; - var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0; accum[j] = accum_new; + auto x = Sign(linear[j]) * l1 - linear[j]; + y = y / lr + l2_plus; + var[j] = std::fabs(linear[j]) > l1 ? x / y : 0; } } } diff --git a/mindspore/ccsrc/session/gpu_session.cc b/mindspore/ccsrc/session/gpu_session.cc index e67a9225670..a0a43f2edda 100644 --- a/mindspore/ccsrc/session/gpu_session.cc +++ b/mindspore/ccsrc/session/gpu_session.cc @@ -112,10 +112,10 @@ void GPUSession::LoadInputData(const std::shared_ptr &kernel_graph, auto tensor_address = tensor->device_address(); bool need_sync = false; if (ms_context->enable_pynative_infer()) { - if (tensor_address.get() == nullptr || tensor_address != device_address) { + if (tensor_address == nullptr || tensor_address != device_address) { need_sync = true; } - } else if (tensor->is_dirty()) { + } else if (tensor->is_dirty() || tensor_address == nullptr) { need_sync = true; } else if (tensor_address != device_address) { if (tensor_address->DeviceType() == device_address->DeviceType()) {