forked from mindspore-Ecosystem/mindspore
!2491 add cpu kernel profiling log
Merge pull request !2491 from kisnwang/add-cpu-kernel-profiling
This commit is contained in:
commit
cc5a2408e6
|
@ -26,6 +26,7 @@
|
|||
#include "device/cpu/cpu_device_address.h"
|
||||
#include "utils/context/ms_context.h"
|
||||
#include "utils/config_manager.h"
|
||||
#include "utils/profile.h"
|
||||
#include "common/utils.h"
|
||||
#include "session/anf_runtime_algorithm.h"
|
||||
#include "session/session_basic.h"
|
||||
|
@ -270,6 +271,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
|
|||
|
||||
auto kernels = kernel_graph->execution_order();
|
||||
for (const auto &kernel : kernels) {
|
||||
#ifdef ENABLE_PROFILE
|
||||
double start_time = GetTime();
|
||||
#endif
|
||||
std::vector<kernel::AddressPtr> kernel_inputs;
|
||||
std::vector<kernel::AddressPtr> kernel_workspaces;
|
||||
std::vector<kernel::AddressPtr> kernel_outputs;
|
||||
|
@ -297,6 +301,10 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
|
|||
if (!ret) {
|
||||
MS_LOG(EXCEPTION) << "Launch kernel failed.";
|
||||
}
|
||||
#ifdef ENABLE_PROFILE
|
||||
double cost_time = GetTime() - start_time;
|
||||
MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us";
|
||||
#endif
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
|
|||
auto linear = input_params->linear_;
|
||||
auto lr = input_params->lr_;
|
||||
auto l1 = input_params->l1_;
|
||||
auto l2 = input_params->l2_;
|
||||
auto l2_plus = 2 * input_params->l2_;
|
||||
auto lr_power = input_params->lr_power_;
|
||||
auto unique_sparse_grad = input_params->sparse_grad_;
|
||||
auto var_first_dim_size = input_params->var_first_dim_size_;
|
||||
|
@ -44,21 +44,18 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
|
|||
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
|
||||
auto summed_grad = unique_sparse_grad.value_[k];
|
||||
auto accum_new = accum[j] + summed_grad * summed_grad;
|
||||
if (lr_power == -0.5) {
|
||||
linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j];
|
||||
} else {
|
||||
linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * var[j];
|
||||
}
|
||||
auto x = Sign(linear[j]) * l1 - linear[j];
|
||||
float y;
|
||||
if (lr_power == -0.5) {
|
||||
y = std::sqrt(accum_new) / lr + 2 * l2;
|
||||
y = std::sqrt(accum_new);
|
||||
linear[j] += summed_grad - (y - std::sqrt(accum[j])) / lr * var[j];
|
||||
} else {
|
||||
y = std::pow(accum_new, -lr_power) / lr + 2 * l2;
|
||||
y = std::pow(accum_new, -lr_power);
|
||||
linear[j] += summed_grad - (y - std::pow(accum[j], -lr_power)) / lr * var[j];
|
||||
}
|
||||
auto pre_shrink = x / y;
|
||||
var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0;
|
||||
accum[j] = accum_new;
|
||||
auto x = Sign(linear[j]) * l1 - linear[j];
|
||||
y = y / lr + l2_plus;
|
||||
var[j] = std::fabs(linear[j]) > l1 ? x / y : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -112,10 +112,10 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
|
|||
auto tensor_address = tensor->device_address();
|
||||
bool need_sync = false;
|
||||
if (ms_context->enable_pynative_infer()) {
|
||||
if (tensor_address.get() == nullptr || tensor_address != device_address) {
|
||||
if (tensor_address == nullptr || tensor_address != device_address) {
|
||||
need_sync = true;
|
||||
}
|
||||
} else if (tensor->is_dirty()) {
|
||||
} else if (tensor->is_dirty() || tensor_address == nullptr) {
|
||||
need_sync = true;
|
||||
} else if (tensor_address != device_address) {
|
||||
if (tensor_address->DeviceType() == device_address->DeviceType()) {
|
||||
|
|
Loading…
Reference in New Issue